GenotypesContext now updates cached data for add, set, replace operations when possible

-- Involved separately managing the sample -> offset and sample sorted list operations.  This should improve performance throughout the system
This commit is contained in:
Mark DePristo 2011-11-22 08:40:48 -05:00
parent 29ca24694a
commit e484625594
4 changed files with 173 additions and 78 deletions

View File

@ -52,9 +52,6 @@ public class GenotypesContext implements List<Genotype> {
*/
Map<String, Integer> sampleNameToOffset = null;
/** if true, then we need to reinitialize sampleNamesInOrder and sampleNameToOffset before we use them /*/
boolean cacheIsInvalid = true;
/**
* An ArrayList of genotypes contained in this context
*
@ -95,7 +92,6 @@ public class GenotypesContext implements List<Genotype> {
protected GenotypesContext(final ArrayList<Genotype> genotypes) {
this.notToBeDirectlyAccessedGenotypes = genotypes;
this.sampleNameToOffset = null;
this.cacheIsInvalid = true;
}
/**
@ -120,7 +116,6 @@ public class GenotypesContext implements List<Genotype> {
this.notToBeDirectlyAccessedGenotypes = genotypes;
this.sampleNameToOffset = sampleNameToOffset;
this.sampleNamesInOrder = sampleNamesInOrder;
this.cacheIsInvalid = false;
}
// ---------------------------------------------------------------------------
@ -246,33 +241,46 @@ public class GenotypesContext implements List<Genotype> {
//
// ---------------------------------------------------------------------------
@Ensures({"cacheIsInvalid == true"})
protected void invalidateCaches() {
cacheIsInvalid = true;
sampleNamesInOrder = null;
@Ensures({"sampleNameToOffset == null"})
protected void invalidateSampleNameMap() {
sampleNameToOffset = null;
}
@Ensures({"cacheIsInvalid == false",
"sampleNamesInOrder != null",
"sampleNameToOffset != null",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"})
protected void buildCache() {
if ( cacheIsInvalid ) {
cacheIsInvalid = false;
@Ensures({"sampleNamesInOrder == null"})
protected void invalidateSampleOrdering() {
sampleNamesInOrder = null;
}
@Ensures({"sampleNamesInOrder != null",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)"})
protected void ensureSampleOrdering() {
if ( sampleNamesInOrder == null ) {
sampleNamesInOrder = new ArrayList<String>(size());
sampleNameToOffset = new HashMap<String, Integer>(size());
for ( int i = 0; i < size(); i++ ) {
final Genotype g = getGenotypes().get(i);
sampleNamesInOrder.add(g.getSampleName());
sampleNameToOffset.put(g.getSampleName(), i);
sampleNamesInOrder.add(getGenotypes().get(i).getSampleName());
}
Collections.sort(sampleNamesInOrder);
}
}
@Ensures({"sampleNameToOffset != null",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"})
protected void ensureSampleNameMap() {
if ( sampleNameToOffset == null ) {
sampleNameToOffset = new HashMap<String, Integer>(size());
for ( int i = 0; i < size(); i++ ) {
sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i);
}
}
}
// for testing purposes
protected void ensureAll() {
ensureSampleNameMap();
ensureSampleOrdering();
}
// ---------------------------------------------------------------------------
//
@ -287,7 +295,8 @@ public class GenotypesContext implements List<Genotype> {
@Override
public void clear() {
checkImmutability();
invalidateCaches();
invalidateSampleNameMap();
invalidateSampleOrdering();
getGenotypes().clear();
}
@ -301,21 +310,43 @@ public class GenotypesContext implements List<Genotype> {
return getGenotypes().isEmpty();
}
/**
* Adds a single genotype to this context.
*
* There are many constraints on this input, and important
* impacts on the performance of other functions provided by this
* context.
*
* First, the sample name of genotype must be unique within this
* context. However, this is not enforced in the code itself, through
* you will invalid the contract on this context if you add duplicate
* samples and are running with CoFoJa enabled.
*
* Second, adding genotype also updates the sample name -> index map,
* so add() followed by containsSample and related function is an efficient
* series of operations.
*
* Third, adding the genotype invalidates the sorted list of sample names, to
* add() followed by any of the SampleNamesInOrder operations is inefficient, as
* each SampleNamesInOrder must rebuild the sorted list of sample names at
* an O(n log n) cost.
*
* @param genotype
* @return
*/
@Override
@Requires({"genotype != null", "get(genotype.getSampleName()) == null"})
@Ensures("noDups(getGenotypes())")
public boolean add(final Genotype genotype) {
checkImmutability();
invalidateCaches();
return getGenotypes().add(genotype);
}
invalidateSampleOrdering();
@Requires({"genotype != null", "! containsAny(Arrays.asList(genotype))"})
@Ensures("noDups(getGenotypes())")
public boolean add(final Genotype ... genotype) {
checkImmutability();
invalidateCaches();
return getGenotypes().addAll(Arrays.asList(genotype));
if ( sampleNameToOffset != null ) {
// update the name map by adding entries
sampleNameToOffset.put(genotype.getSampleName(), size());
}
return getGenotypes().add(genotype);
}
@Override
@ -325,12 +356,30 @@ public class GenotypesContext implements List<Genotype> {
throw new UnsupportedOperationException();
}
/**
* Adds all of the genotypes to this context
*
* See {@link #add(Genotype)} for important information about this functions
* constraints and performance costs
*
* @param genotypes
* @return
*/
@Override
@Requires("! containsAny(genotypes)")
@Ensures("noDups(getGenotypes())")
public boolean addAll(final Collection<? extends Genotype> genotypes) {
checkImmutability();
invalidateCaches();
invalidateSampleOrdering();
if ( sampleNameToOffset != null ) {
// update the name map by adding entries
int pos = size();
for ( final Genotype g : genotypes ) {
sampleNameToOffset.put(g.getSampleName(), pos++);
}
}
return getGenotypes().addAll(genotypes);
}
@ -362,13 +411,12 @@ public class GenotypesContext implements List<Genotype> {
}
public Genotype get(final String sampleName) {
buildCache();
Integer offset = getSampleI(sampleName);
return offset == null ? null : getGenotypes().get(offset);
}
private Integer getSampleI(final String sampleName) {
buildCache();
ensureSampleNameMap();
return sampleNameToOffset.get(sampleName);
}
@ -401,31 +449,58 @@ public class GenotypesContext implements List<Genotype> {
// return genotypes.listIterator(i);
}
/**
* Note that remove requires us to invalidate our sample -> index
* cache. The loop:
*
* GenotypesContext gc = ...
* for ( sample in samples )
* if ( gc.containsSample(sample) )
* gc.remove(sample)
*
* is extremely inefficient, as each call to remove invalidates the cache
* and containsSample requires us to rebuild it, an O(n) operation.
*
* If you must remove many samples from the GC, use either removeAll or retainAll
* to avoid this O(n * m) operation.
*
* @param i
* @return
*/
@Override
public Genotype remove(final int i) {
checkImmutability();
invalidateCaches();
invalidateSampleNameMap();
invalidateSampleOrdering();
return getGenotypes().remove(i);
}
/**
* See for important warning {@link this.remove(Integer)}
* @param o
* @return
*/
@Override
public boolean remove(final Object o) {
checkImmutability();
invalidateCaches();
invalidateSampleNameMap();
invalidateSampleOrdering();
return getGenotypes().remove(o);
}
@Override
public boolean removeAll(final Collection<?> objects) {
checkImmutability();
invalidateCaches();
invalidateSampleNameMap();
invalidateSampleOrdering();
return getGenotypes().removeAll(objects);
}
@Override
public boolean retainAll(final Collection<?> objects) {
checkImmutability();
invalidateCaches();
invalidateSampleNameMap();
invalidateSampleOrdering();
return getGenotypes().retainAll(objects);
}
@ -433,14 +508,28 @@ public class GenotypesContext implements List<Genotype> {
@Ensures("noDups(getGenotypes())")
public Genotype set(final int i, final Genotype genotype) {
checkImmutability();
invalidateCaches();
return getGenotypes().set(i, genotype);
final Genotype prev = getGenotypes().set(i, genotype);
invalidateSampleOrdering();
if ( sampleNameToOffset != null ) {
// update the name map by removing the old entry and replacing it with the new one
sampleNameToOffset.remove(prev.getSampleName());
sampleNameToOffset.put(genotype.getSampleName(), i);
}
return prev;
}
/**
* Replaces the genotype in this context -- note for efficiency
* reasons we do not add the genotype if it's not present. The
* return value will be null indicating this happened.
*
* Note this operation is preserves the map cache Sample -> Offset but
* invalidates the sorted list of samples. Using replace within a loop
* containing any of the SampleNameInOrder operation requires an O(n log n)
* resorting after each replace operation.
*
* @param genotype a non null genotype to bind in this context
* @return null if genotype was not added, otherwise returns the previous genotype
*/
@ -451,7 +540,7 @@ public class GenotypesContext implements List<Genotype> {
if ( offset == null )
return null;
else
return getGenotypes().set(offset, genotype);
return set(offset, genotype);
}
@Override
@ -523,7 +612,7 @@ public class GenotypesContext implements List<Genotype> {
*/
@Ensures("result != null")
public Set<String> getSampleNames() {
buildCache();
ensureSampleNameMap();
return sampleNameToOffset.keySet();
}
@ -532,19 +621,18 @@ public class GenotypesContext implements List<Genotype> {
*/
@Ensures("result != null")
public List<String> getSampleNamesOrderedByName() {
buildCache();
ensureSampleOrdering();
return sampleNamesInOrder;
}
@Requires("sample != null")
public boolean containsSample(final String sample) {
buildCache();
ensureSampleNameMap();
return sampleNameToOffset.containsKey(sample);
}
@Requires("samples != null")
public boolean containsSamples(final Collection<String> samples) {
buildCache();
return getSampleNames().containsAll(samples);
}

View File

@ -81,8 +81,8 @@ public class LazyGenotypesContext extends GenotypesContext {
final List<String> sampleNamesInOrder;
@Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null",
"sameSamples(genotypes, sampleNamesInOrder)",
"sameSamples(genotypes, sampleNameToOffset.keySet())"})
"sameSamples(genotypes, sampleNamesInOrder)",
"sameSamples(genotypes, sampleNameToOffset.keySet())"})
public LazyData(final ArrayList<Genotype> genotypes,
final List<String> sampleNamesInOrder,
final Map<String, Integer> sampleNameToOffset) {
@ -119,13 +119,20 @@ public class LazyGenotypesContext extends GenotypesContext {
@Override
@Ensures("result != null")
protected ArrayList<Genotype> getGenotypes() {
decode();
return notToBeDirectlyAccessedGenotypes;
}
/**
* Force us to decode the genotypes, if not already done
*/
public void decode() {
if ( ! loaded ) {
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
LazyData parsed = parser.parse(unparsedGenotypeData);
notToBeDirectlyAccessedGenotypes = parsed.genotypes;
sampleNamesInOrder = parsed.sampleNamesInOrder;
sampleNameToOffset = parsed.sampleNameToOffset;
cacheIsInvalid = false; // these values build the cache
loaded = true;
unparsedGenotypeData = null; // don't hold the unparsed data any longer
@ -133,31 +140,43 @@ public class LazyGenotypesContext extends GenotypesContext {
// That said, it's not such an important routine -- it's just checking that the genotypes
// are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec
}
return notToBeDirectlyAccessedGenotypes;
}
/**
* Overrides the buildCache functionality. If the data hasn't been loaded
* Overrides the ensure* functionality. If the data hasn't been loaded
* yet and we want to build the cache, just decode it and we're done. If we've
* already decoded the data, though, go through the super class
*/
@Override
protected synchronized void buildCache() {
if ( cacheIsInvalid ) {
if ( ! loaded ) {
getGenotypes(); // will load up all of the necessary data
} else {
super.buildCache();
}
protected synchronized void ensureSampleNameMap() {
if ( ! loaded ) {
decode(); // will load up all of the necessary data
} else {
super.ensureSampleNameMap();
}
}
@Override
protected void invalidateCaches() {
protected synchronized void ensureSampleOrdering() {
if ( ! loaded ) {
decode(); // will load up all of the necessary data
} else {
super.ensureSampleOrdering();
}
}
@Override
protected void invalidateSampleNameMap() {
// if the cache is invalidated, and we haven't loaded our data yet, do so
if ( ! loaded ) getGenotypes();
super.invalidateCaches();
if ( ! loaded ) decode();
super.invalidateSampleNameMap();
}
@Override
protected void invalidateSampleOrdering() {
// if the cache is invalidated, and we haven't loaded our data yet, do so
if ( ! loaded ) decode();
super.invalidateSampleOrdering();
}
@Override
@ -177,11 +196,4 @@ public class LazyGenotypesContext extends GenotypesContext {
public Object getUnparsedGenotypeData() {
return unparsedGenotypeData;
}
/**
* Force us to decode the genotypes
*/
public void decode() {
buildCache();
}
}

View File

@ -532,7 +532,6 @@ public class VariantContextUtils {
final Map<String, Object> attributesWithMaxAC = new TreeMap<String, Object>();
double log10PError = 1;
VariantContext vcWithMaxAC = null;
Set<String> addedSamples = new HashSet<String>(first.getNSamples());
GenotypesContext genotypes = GenotypesContext.create();
// counting the number of filtered and variant VCs
@ -557,7 +556,7 @@ public class VariantContextUtils {
alleles.addAll(alleleMapping.values());
mergeGenotypes(genotypes, addedSamples, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);
mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);
log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);
@ -963,10 +962,10 @@ public class VariantContextUtils {
}
}
private static void mergeGenotypes(GenotypesContext mergedGenotypes, Set<String> addedSamples, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) {
private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) {
for ( Genotype g : oneVC.getGenotypes() ) {
String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples);
if ( ! addedSamples.contains(name) ) {
if ( mergedGenotypes.containsSample(name) ) {
// only add if the name is new
Genotype newG = g;
@ -976,7 +975,6 @@ public class VariantContextUtils {
}
mergedGenotypes.add(newG);
addedSamples.add(name);
}
}
}

View File

@ -88,7 +88,8 @@ public class GenotypesContextUnitTest extends BaseTest {
@Override
public LazyGenotypesContext.LazyData parse(final Object data) {
GenotypesContext gc = GenotypesContext.copy((List<Genotype>)data);
gc.buildCache();
gc.ensureSampleNameMap();
gc.ensureSampleOrdering();
return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset);
}
@ -234,10 +235,6 @@ public class GenotypesContextUnitTest extends BaseTest {
gc.add(add2);
testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));
gc = cfg.makeContext();
gc.add(add1, add2);
testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));
gc = cfg.makeContext();
gc.addAll(Arrays.asList(add1, add2));
testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));