Two optimizations. First, an even more aggressive printProgress meter optimization: it now only considers doing any work once every 1000 cycles through the engine. Second, GenomeLocParser now uses a single indirection around the contigInfo variable: a wrapper class that uses a last-used cache to retrieve contig information efficiently, instead of always going back to the underlying SAMSequenceDictionary hashmap to make genome locs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5670 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-04-21 01:31:26 +00:00
parent 29857f5ba6
commit cc78027bd3
2 changed files with 87 additions and 8 deletions

View File

@ -115,6 +115,8 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
// Timer for this engine, created with the label "Traversal"
private SimpleTimer timer = new SimpleTimer("Traversal");
// printProgress() checks its call counter against this value and bails out early
// unless the counter is a multiple of it
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
// Call counter that printProgress() post-increments inside its early-exit check
private int printProgressCheckCounter = 0;
// How long can we go without printing some progress info?
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
// 2 hours * 60 minutes * 60 seconds = 7200 seconds
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
@ -242,6 +244,10 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
* @param mustPrint If true, will print out info, regardless of nRecords or time interval
*/
private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) {
if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 )
// don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES
return;
if(!progressMeterInitialized && mustPrint == false ) {
logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]");
logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining",

View File

@ -62,8 +62,82 @@ public class GenomeLocParser {
// Ugly global variable defining the optional ordering of contig elements
//
// --------------------------------------------------------------------------------------------------------------
//public static Map<String, Integer> refContigOrdering = null;
protected SAMSequenceDictionary contigInfo = null;
/**
* A wrapper class that provides efficient last used caching for the global
* SAMSequenceDictionary underlying all of the GATK engine capabilities
*/
private final class MasterSequenceDictionary {
final private SAMSequenceDictionary dict;
// cache
SAMSequenceRecord lastSSR = null;
String lastContig = "";
int lastIndex = -1;
public MasterSequenceDictionary(SAMSequenceDictionary dict) {
this.dict = dict;
}
public final int getNSequences() {
return dict.size();
}
public synchronized final SAMSequenceRecord getSequence(final String contig) {
if ( isCached(contig) )
return lastSSR;
else
return updateCache(dict.getSequence(contig));
}
public synchronized final SAMSequenceRecord getSequence(final int index) {
if ( isCached(index) )
return lastSSR;
else
return updateCache(dict.getSequence(index));
}
public synchronized final int getSequenceIndex(final String contig) {
if ( ! isCached(contig) ) {
SAMSequenceRecord rec = dict.getSequence(contig);
if ( rec == null )
return -1; // not found
else
updateCache(rec);
}
return lastIndex;
}
private synchronized boolean isCached(final String contig) {
return lastContig.equals(contig);
}
private synchronized boolean isCached(final int index) {
return lastIndex == index;
}
/**
* The key algorithm. Given a new record, update the last used record, contig
* name, and index.
*
* @param rec
* @return
*/
private synchronized SAMSequenceRecord updateCache(SAMSequenceRecord rec) {
if ( rec == null ) {
return null;
} else {
lastSSR = rec;
lastContig = rec.getSequenceName();
lastIndex = rec.getSequenceIndex();
return rec;
}
}
}
private MasterSequenceDictionary contigInfo = null;
/**
* set our internal reference contig order
@ -78,7 +152,7 @@ public class GenomeLocParser {
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
throw new UserException.CommandLineException("Failed to load reference dictionary");
} else if (contigInfo == null) {
contigInfo = seqDict;
contigInfo = new MasterSequenceDictionary(seqDict);
logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences()) {
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
@ -123,7 +197,6 @@ public class GenomeLocParser {
* @return a GenomeLoc representing the String
*
*/
public GenomeLoc parseGenomeInterval(final String str) {
GenomeLoc ret = parseGenomeLoc(str);
exceptionOnInvalidGenomeLocBounds(ret);
@ -263,7 +336,7 @@ public class GenomeLocParser {
*/
private boolean isContigValid(String contig) {
int contigIndex = contigInfo.getSequenceIndex(contig);
return contigIndex >= 0 && contigIndex < contigInfo.size();
return contigIndex >= 0 && contigIndex < contigInfo.getNSequences();
}
/**
@ -426,7 +499,7 @@ public class GenomeLocParser {
if (toReturn.getContigIndex() < 0) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0");
}
if (toReturn.getContigIndex() >= contigInfo.getSequences().size()) {
if (toReturn.getContigIndex() >= contigInfo.getNSequences()) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count");
}
@ -468,7 +541,7 @@ public class GenomeLocParser {
public boolean validGenomeLoc(GenomeLoc loc) {
// quick check before we get the contig size, is the contig number valid
if ((loc.getContigIndex() < 0) || // the contig index has to be positive
(loc.getContigIndex() >= contigInfo.getSequences().size())) // the contig must be in the integer range of contigs)
(loc.getContigIndex() >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs)
return false;
int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength();
@ -510,7 +583,7 @@ public class GenomeLocParser {
* performs interval-style validation: contig is valid and start and stop less than the end
*/
public boolean validGenomeLoc(int contigIndex, int start, int stop) {
if (contigIndex < 0 || contigIndex >= contigInfo.size()) return false;
if (contigIndex < 0 || contigIndex >= contigInfo.getNSequences()) return false;
return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop));
}