diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 8d28ba18c..248fdad9d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -52,9 +52,6 @@ public class GenotypesContext implements List { */ Map sampleNameToOffset = null; - /** if true, then we need to reinitialize sampleNamesInOrder and sampleNameToOffset before we use them /*/ - boolean cacheIsInvalid = true; - /** * An ArrayList of genotypes contained in this context * @@ -95,7 +92,6 @@ public class GenotypesContext implements List { protected GenotypesContext(final ArrayList genotypes) { this.notToBeDirectlyAccessedGenotypes = genotypes; this.sampleNameToOffset = null; - this.cacheIsInvalid = true; } /** @@ -120,7 +116,6 @@ public class GenotypesContext implements List { this.notToBeDirectlyAccessedGenotypes = genotypes; this.sampleNameToOffset = sampleNameToOffset; this.sampleNamesInOrder = sampleNamesInOrder; - this.cacheIsInvalid = false; } // --------------------------------------------------------------------------- @@ -246,33 +241,46 @@ public class GenotypesContext implements List { // // --------------------------------------------------------------------------- - @Ensures({"cacheIsInvalid == true"}) - protected void invalidateCaches() { - cacheIsInvalid = true; - sampleNamesInOrder = null; + @Ensures({"sampleNameToOffset == null"}) + protected void invalidateSampleNameMap() { sampleNameToOffset = null; } - @Ensures({"cacheIsInvalid == false", - "sampleNamesInOrder != null", - "sampleNameToOffset != null", - "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)", - "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"}) - protected void buildCache() { - if ( cacheIsInvalid ) { - cacheIsInvalid = false; + @Ensures({"sampleNamesInOrder == null"}) + protected void invalidateSampleOrdering() { + sampleNamesInOrder = null; + } + + @Ensures({"sampleNamesInOrder != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)"}) + protected void ensureSampleOrdering() { + if ( sampleNamesInOrder == null ) { sampleNamesInOrder = new ArrayList(size()); - sampleNameToOffset = new HashMap(size()); for ( int i = 0; i < size(); i++ ) { - final Genotype g = getGenotypes().get(i); - sampleNamesInOrder.add(g.getSampleName()); - sampleNameToOffset.put(g.getSampleName(), i); + sampleNamesInOrder.add(getGenotypes().get(i).getSampleName()); } Collections.sort(sampleNamesInOrder); } } + @Ensures({"sampleNameToOffset != null", + "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"}) + protected void ensureSampleNameMap() { + if ( sampleNameToOffset == null ) { + sampleNameToOffset = new HashMap(size()); + + for ( int i = 0; i < size(); i++ ) { + sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i); + } + } + } + + // for testing purposes + protected void ensureAll() { + ensureSampleNameMap(); + ensureSampleOrdering(); + } // --------------------------------------------------------------------------- // @@ -287,7 +295,8 @@ public class GenotypesContext implements List { @Override public void clear() { checkImmutability(); - invalidateCaches(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); getGenotypes().clear(); } @@ -301,21 +310,43 @@ public class GenotypesContext implements List { return getGenotypes().isEmpty(); } + /** + * Adds a single genotype to this context. + * + * There are many constraints on this input, and important + * impacts on the performance of other functions provided by this + * context. + * + * First, the sample name of genotype must be unique within this + * context. However, this is not enforced in the code itself, through + * you will invalid the contract on this context if you add duplicate + * samples and are running with CoFoJa enabled. + * + * Second, adding genotype also updates the sample name -> index map, + * so add() followed by containsSample and related function is an efficient + * series of operations. + * + * Third, adding the genotype invalidates the sorted list of sample names, to + * add() followed by any of the SampleNamesInOrder operations is inefficient, as + * each SampleNamesInOrder must rebuild the sorted list of sample names at + * an O(n log n) cost. + * + * @param genotype + * @return + */ @Override @Requires({"genotype != null", "get(genotype.getSampleName()) == null"}) @Ensures("noDups(getGenotypes())") public boolean add(final Genotype genotype) { checkImmutability(); - invalidateCaches(); - return getGenotypes().add(genotype); - } + invalidateSampleOrdering(); - @Requires({"genotype != null", "! containsAny(Arrays.asList(genotype))"}) - @Ensures("noDups(getGenotypes())") - public boolean add(final Genotype ... genotype) { - checkImmutability(); - invalidateCaches(); - return getGenotypes().addAll(Arrays.asList(genotype)); + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + sampleNameToOffset.put(genotype.getSampleName(), size()); + } + + return getGenotypes().add(genotype); } @Override @@ -325,12 +356,30 @@ public class GenotypesContext implements List { throw new UnsupportedOperationException(); } + /** + * Adds all of the genotypes to this context + * + * See {@link #add(Genotype)} for important information about this functions + * constraints and performance costs + * + * @param genotypes + * @return + */ @Override @Requires("! containsAny(genotypes)") @Ensures("noDups(getGenotypes())") public boolean addAll(final Collection genotypes) { checkImmutability(); - invalidateCaches(); + invalidateSampleOrdering(); + + if ( sampleNameToOffset != null ) { + // update the name map by adding entries + int pos = size(); + for ( final Genotype g : genotypes ) { + sampleNameToOffset.put(g.getSampleName(), pos++); + } + } + return getGenotypes().addAll(genotypes); } @@ -362,13 +411,12 @@ public class GenotypesContext implements List { } public Genotype get(final String sampleName) { - buildCache(); Integer offset = getSampleI(sampleName); return offset == null ? null : getGenotypes().get(offset); } private Integer getSampleI(final String sampleName) { - buildCache(); + ensureSampleNameMap(); return sampleNameToOffset.get(sampleName); } @@ -401,31 +449,58 @@ public class GenotypesContext implements List { // return genotypes.listIterator(i); } + /** + * Note that remove requires us to invalidate our sample -> index + * cache. The loop: + * + * GenotypesContext gc = ... + * for ( sample in samples ) + * if ( gc.containsSample(sample) ) + * gc.remove(sample) + * + * is extremely inefficient, as each call to remove invalidates the cache + * and containsSample requires us to rebuild it, an O(n) operation. + * + * If you must remove many samples from the GC, use either removeAll or retainAll + * to avoid this O(n * m) operation. + * + * @param i + * @return + */ @Override public Genotype remove(final int i) { checkImmutability(); - invalidateCaches(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); return getGenotypes().remove(i); } + /** + * See for important warning {@link this.remove(Integer)} + * @param o + * @return + */ @Override public boolean remove(final Object o) { checkImmutability(); - invalidateCaches(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); return getGenotypes().remove(o); } @Override public boolean removeAll(final Collection objects) { checkImmutability(); - invalidateCaches(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); return getGenotypes().removeAll(objects); } @Override public boolean retainAll(final Collection objects) { checkImmutability(); - invalidateCaches(); + invalidateSampleNameMap(); + invalidateSampleOrdering(); return getGenotypes().retainAll(objects); } @@ -433,14 +508,28 @@ public class GenotypesContext implements List { @Ensures("noDups(getGenotypes())") public Genotype set(final int i, final Genotype genotype) { checkImmutability(); - invalidateCaches(); - return getGenotypes().set(i, genotype); + final Genotype prev = getGenotypes().set(i, genotype); + + invalidateSampleOrdering(); + if ( sampleNameToOffset != null ) { + // update the name map by removing the old entry and replacing it with the new one + sampleNameToOffset.remove(prev.getSampleName()); + sampleNameToOffset.put(genotype.getSampleName(), i); + } + + return prev; } /** * Replaces the genotype in this context -- note for efficiency * reasons we do not add the genotype if it's not present. The * return value will be null indicating this happened. + * + * Note this operation is preserves the map cache Sample -> Offset but + * invalidates the sorted list of samples. Using replace within a loop + * containing any of the SampleNameInOrder operation requires an O(n log n) + * resorting after each replace operation. + * * @param genotype a non null genotype to bind in this context * @return null if genotype was not added, otherwise returns the previous genotype */ @@ -451,7 +540,7 @@ public class GenotypesContext implements List { if ( offset == null ) return null; else - return getGenotypes().set(offset, genotype); + return set(offset, genotype); } @Override @@ -523,7 +612,7 @@ public class GenotypesContext implements List { */ @Ensures("result != null") public Set getSampleNames() { - buildCache(); + ensureSampleNameMap(); return sampleNameToOffset.keySet(); } @@ -532,19 +621,18 @@ public class GenotypesContext implements List { */ @Ensures("result != null") public List getSampleNamesOrderedByName() { - buildCache(); + ensureSampleOrdering(); return sampleNamesInOrder; } @Requires("sample != null") public boolean containsSample(final String sample) { - buildCache(); + ensureSampleNameMap(); return sampleNameToOffset.containsKey(sample); } @Requires("samples != null") public boolean containsSamples(final Collection samples) { - buildCache(); return getSampleNames().containsAll(samples); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java index 574bdc3d0..ce0422352 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java @@ -81,8 +81,8 @@ public class LazyGenotypesContext extends GenotypesContext { final List sampleNamesInOrder; @Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null", - "sameSamples(genotypes, sampleNamesInOrder)", - "sameSamples(genotypes, sampleNameToOffset.keySet())"}) + "sameSamples(genotypes, sampleNamesInOrder)", + "sameSamples(genotypes, sampleNameToOffset.keySet())"}) public LazyData(final ArrayList genotypes, final List sampleNamesInOrder, final Map sampleNameToOffset) { @@ -119,13 +119,20 @@ public class LazyGenotypesContext extends GenotypesContext { @Override @Ensures("result != null") protected ArrayList getGenotypes() { + decode(); + return notToBeDirectlyAccessedGenotypes; + } + + /** + * Force us to decode the genotypes, if not already done + */ + public void decode() { if ( ! loaded ) { //System.out.printf("Loading genotypes... %s:%d%n", contig, start); LazyData parsed = parser.parse(unparsedGenotypeData); notToBeDirectlyAccessedGenotypes = parsed.genotypes; sampleNamesInOrder = parsed.sampleNamesInOrder; sampleNameToOffset = parsed.sampleNameToOffset; - cacheIsInvalid = false; // these values build the cache loaded = true; unparsedGenotypeData = null; // don't hold the unparsed data any longer @@ -133,31 +140,43 @@ public class LazyGenotypesContext extends GenotypesContext { // That said, it's not such an important routine -- it's just checking that the genotypes // are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec } - - return notToBeDirectlyAccessedGenotypes; } /** - * Overrides the buildCache functionality. If the data hasn't been loaded + * Overrides the ensure* functionality. If the data hasn't been loaded * yet and we want to build the cache, just decode it and we're done. If we've * already decoded the data, though, go through the super class */ @Override - protected synchronized void buildCache() { - if ( cacheIsInvalid ) { - if ( ! loaded ) { - getGenotypes(); // will load up all of the necessary data - } else { - super.buildCache(); - } + protected synchronized void ensureSampleNameMap() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleNameMap(); } } @Override - protected void invalidateCaches() { + protected synchronized void ensureSampleOrdering() { + if ( ! loaded ) { + decode(); // will load up all of the necessary data + } else { + super.ensureSampleOrdering(); + } + } + + @Override + protected void invalidateSampleNameMap() { // if the cache is invalidated, and we haven't loaded our data yet, do so - if ( ! loaded ) getGenotypes(); - super.invalidateCaches(); + if ( ! loaded ) decode(); + super.invalidateSampleNameMap(); + } + + @Override + protected void invalidateSampleOrdering() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! loaded ) decode(); + super.invalidateSampleOrdering(); } @Override @@ -177,11 +196,4 @@ public class LazyGenotypesContext extends GenotypesContext { public Object getUnparsedGenotypeData() { return unparsedGenotypeData; } - - /** - * Force us to decode the genotypes - */ - public void decode() { - buildCache(); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 21a371e2f..05e768ea7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -532,7 +532,6 @@ public class VariantContextUtils { final Map attributesWithMaxAC = new TreeMap(); double log10PError = 1; VariantContext vcWithMaxAC = null; - Set addedSamples = new HashSet(first.getNSamples()); GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs @@ -557,7 +556,7 @@ public class VariantContextUtils { alleles.addAll(alleleMapping.values()); - mergeGenotypes(genotypes, addedSamples, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1); @@ -963,10 +962,10 @@ public class VariantContextUtils { } } - private static void mergeGenotypes(GenotypesContext mergedGenotypes, Set addedSamples, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { for ( Genotype g : oneVC.getGenotypes() ) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); - if ( ! addedSamples.contains(name) ) { + if ( mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; @@ -976,7 +975,6 @@ public class VariantContextUtils { } mergedGenotypes.add(newG); - addedSamples.add(name); } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java index afb9336a0..ee0a5dfe0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java @@ -88,7 +88,8 @@ public class GenotypesContextUnitTest extends BaseTest { @Override public LazyGenotypesContext.LazyData parse(final Object data) { GenotypesContext gc = GenotypesContext.copy((List)data); - gc.buildCache(); + gc.ensureSampleNameMap(); + gc.ensureSampleOrdering(); return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset); } @@ -234,10 +235,6 @@ public class GenotypesContextUnitTest extends BaseTest { gc.add(add2); testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); - gc = cfg.makeContext(); - gc.add(add1, add2); - testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); - gc = cfg.makeContext(); gc.addAll(Arrays.asList(add1, add2)); testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));