From 2e9ecf639ef8846faa40e1200c032bf80c38406e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 21 Nov 2011 09:30:40 -0500 Subject: [PATCH] Generalized interface to LazyGenotypesContext -- Now you provide a LazyParsing object -- LazyGenotypesContext now knows nothing about the VCF parser itself. The parser holds all of the necessary data to parse the VCF genotypes when necessarily, and the LGC only has a pointer to this object -- Using new interface added LazyGenotypesContext to unit tests with a simple lazy version -- Deleted VCFParser interface, as it was no longer necessary --- .../utils/codecs/vcf/AbstractVCFCodec.java | 32 ++++++- .../utils/codecs/vcf/StandardVCFWriter.java | 2 +- .../sting/utils/codecs/vcf/VCF3Codec.java | 10 +- .../sting/utils/codecs/vcf/VCFCodec.java | 10 +- .../sting/utils/codecs/vcf/VCFParser.java | 24 ----- .../variantcontext/GenotypesContext.java | 5 +- .../variantcontext/LazyGenotypesContext.java | 93 +++++++++++-------- .../utils/variantcontext/VariantContext.java | 2 - .../GenotypesContextUnitTest.java | 31 +++++-- 9 files changed, 117 insertions(+), 92 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index ee3184dc2..216ee3fb6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -17,7 +17,7 @@ import java.util.*; import java.util.zip.GZIPInputStream; -public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser { +public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -59,6 +59,29 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, protected Map stringCache = new HashMap(); + /** + * Creates a LazyParser for a LazyGenotypesContext to use to decode + * our genotypes only when necessary. We do this instead of eagarly + * decoding the genotypes just to turn around and reencode in the frequent + * case where we don't actually want to manipulate the genotypes + */ + class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { + final List alleles; + final String contig; + final int start; + + LazyVCFGenotypesParser(final List alleles, final String contig, final int start) { + this.alleles = alleles; + this.contig = contig; + this.start = start; + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + //System.out.printf("Loading genotypes... %s:%d%n", contig, start); + return createGenotypeMap((String) data, alleles, contig, start); + } + } /** * @param reader the line reader to take header lines from @@ -68,13 +91,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public abstract GenotypesContext createGenotypeMap(String str, List alleles, String chr, int pos); + public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos); /** @@ -294,7 +318,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // do we have genotyping data if (parts.length > NUM_STANDARD_FIELDS) { - LazyGenotypesContext lazy = new LazyGenotypesContext(this, parts[8], chr, pos, alleles, header.getGenotypeSamples().size()); + final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); + final int nGenotypes = header.getGenotypeSamples().size(); + LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); builder.genotypesNoValidation(lazy); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 7a496cb7c..ac1da7110 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -229,7 +229,7 @@ public class StandardVCFWriter extends IndexingVCFWriter { final GenotypesContext gc = vc.getGenotypes(); if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) { mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData()); + mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString()); } else { List genotypeAttributeKeys = new ArrayList(); if ( vc.hasGenotypes() ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index 5e6e2e94a..6f8e64e55 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.IOException; @@ -112,13 +109,14 @@ public class VCF3Codec extends AbstractVCFCodec { /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @param chr chrom * @param pos position * @return a mapping of sample name to genotype object */ - public GenotypesContext createGenotypeMap(String str, List alleles, String chr, int pos) { + public LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; @@ -191,7 +189,7 @@ public class VCF3Codec extends AbstractVCFCodec { } } - return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder); + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index b7b7eb5f7..407c4bc41 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.IOException; @@ -141,11 +138,12 @@ public class VCFCodec extends AbstractVCFCodec { /** * create a genotype map + * * @param str the string * @param alleles the list of alleles * @return a mapping of sample name to genotype object */ - public GenotypesContext createGenotypeMap(String str, List alleles, String chr, int pos) { + public LazyGenotypesContext.LazyData createGenotypeMap(String str, List alleles, String chr, int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; @@ -215,7 +213,7 @@ public class VCFCodec extends AbstractVCFCodec { } } - return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder); + return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java deleted file mode 100755 index 8903a176a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFParser.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.vcf; - -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; - -import java.util.List; - - -/** - * All VCF codecs need to implement this interface so that we can perform lazy loading. - */ -public interface VCFParser { - - /** - * create a genotype map - * @param str the string - * @param alleles the list of alleles - * @param chr chrom - * @param pos position - * @return a mapping of sample name to genotype object - */ - public GenotypesContext createGenotypeMap(String str, List alleles, String chr, int pos); - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index 846e6c89c..ab5ab9465 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -251,7 +251,7 @@ public class GenotypesContext implements List { // --------------------------------------------------------------------------- @Ensures({"cacheIsInvalid == true"}) - private synchronized void invalidateCaches() { + protected void invalidateCaches() { cacheIsInvalid = true; sampleNamesInOrder = null; sampleNameToOffset = null; @@ -262,7 +262,7 @@ public class GenotypesContext implements List { "sampleNameToOffset != null", "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)", "sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"}) - protected synchronized void buildCache() { + protected void buildCache() { if ( cacheIsInvalid ) { cacheIsInvalid = false; sampleNamesInOrder = new ArrayList(size()); @@ -291,6 +291,7 @@ public class GenotypesContext implements List { @Override public void clear() { checkImmutability(); + invalidateCaches(); getGenotypes().clear(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java index fccc8c667..7facfacf6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/LazyGenotypesContext.java @@ -26,10 +26,8 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Map; @@ -41,20 +39,10 @@ import java.util.Map; * decoding the genotypes unnecessarily. */ public class LazyGenotypesContext extends GenotypesContext { - /** parser the VCF parser we'll use to decode unparsedGenotypeData if necessary */ - final VCFParser parser; + /** The LazyParser we'll use to decode unparsedGenotypeData if necessary */ + final LazyParser parser; - /** a string containing the unparsed VCF genotypes */ - String unparsedGenotypeData; - - /** alleles the current list of alleles at the site (known already in the parser) */ - final List alleles; - - /** contig the current contig (known already in the parser) */ - final String contig; - - /** the current start position (known already in the parser) */ - final int start; + Object unparsedGenotypeData; /** * nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data @@ -70,28 +58,48 @@ public class LazyGenotypesContext extends GenotypesContext { private final static ArrayList EMPTY = new ArrayList(0); /** - * Creates a new lazy loading genotypes context - * - * @param parser the VCF parser we'll use to decode unparsedGenotypeData if necessary - * @param unparsedGenotypeData a string containing the unparsed VCF genotypes - * @param contig the current contig (known already in the parser) - * @param start the current start position (known already in the parser) - * @param alleles the current list of alleles at the site (known already in the parser) - * @param nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data - * (known already in the parser). Useful for isEmpty and size() optimizations + * Simple lazy parser interface. Provide an object implementing this + * interface to LazyGenotypesContext, and it's parse method will be called + * when the use of the lazy context requires the underlying genotypes data + * be parsed into Genotype objects. The data argument is the data provided + * to the LazyGenotypesContext holding encoded genotypes data */ - @Requires({"parser != null", "unparsedGenotypeData != null", - "contig != null", "start >= 0", "alleles != null && alleles.size() > 0", - "nUnparsedGenotypes > 0"}) - public LazyGenotypesContext(final VCFParser parser, final String unparsedGenotypeData, - final String contig, final int start, final List alleles, - int nUnparsedGenotypes ) { + public interface LazyParser { + public LazyData parse(Object data); + } + + /** + * Returns the data used in the full GenotypesContext constructor + * + * {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List, boolean)} + */ + public static class LazyData { + final ArrayList genotypes; + final Map sampleNameToOffset; + final List sampleNamesInOrder; + + public LazyData(final ArrayList genotypes, + final List sampleNamesInOrder, + final Map sampleNameToOffset) { + this.genotypes = genotypes; + this.sampleNamesInOrder = sampleNamesInOrder; + this.sampleNameToOffset = sampleNameToOffset; + } + } + + /** + * Creates a new lazy loading genotypes context using the LazyParser to create + * genotypes data on demand. + * + * @param parser the parser to be used to load on-demand genotypes data + * @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary + * @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data + */ + @Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"}) + public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) { super(EMPTY, false); - this.unparsedGenotypeData = unparsedGenotypeData; - this.start = start; this.parser = parser; - this.contig = contig; - this.alleles = alleles; + this.unparsedGenotypeData = unparsedGenotypeData; this.nUnparsedGenotypes = nUnparsedGenotypes; } @@ -108,10 +116,10 @@ public class LazyGenotypesContext extends GenotypesContext { protected ArrayList getGenotypes() { if ( ! loaded ) { //System.out.printf("Loading genotypes... %s:%d%n", contig, start); - GenotypesContext subcontext = parser.createGenotypeMap(unparsedGenotypeData, alleles, contig, start); - notToBeDirectlyAccessedGenotypes = subcontext.notToBeDirectlyAccessedGenotypes; - sampleNamesInOrder = subcontext.sampleNamesInOrder; - sampleNameToOffset = subcontext.sampleNameToOffset; + LazyData parsed = parser.parse(unparsedGenotypeData); + notToBeDirectlyAccessedGenotypes = parsed.genotypes; + sampleNamesInOrder = parsed.sampleNamesInOrder; + sampleNameToOffset = parsed.sampleNameToOffset; cacheIsInvalid = false; // these values build the cache loaded = true; unparsedGenotypeData = null; // don't hold the unparsed data any longer @@ -140,6 +148,13 @@ public class LazyGenotypesContext extends GenotypesContext { } } + @Override + protected void invalidateCaches() { + // if the cache is invalidated, and we haven't loaded our data yet, do so + if ( ! loaded ) getGenotypes(); + super.invalidateCaches(); + } + @Override public boolean isEmpty() { // optimization -- we know the number of samples in the unparsed data, so use it here to @@ -154,7 +169,7 @@ public class LazyGenotypesContext extends GenotypesContext { return loaded ? super.size() : nUnparsedGenotypes; } - public String getUnparsedGenotypeData() { + public Object getUnparsedGenotypeData() { return unparsedGenotypeData; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 8d74f5220..331ca97d3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,11 +1,9 @@ package org.broadinstitute.sting.utils.variantcontext; -import org.apache.commons.lang.Validate; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java index a65051fae..c12fbac9c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypesContextUnitTest.java @@ -79,8 +79,21 @@ public class GenotypesContextUnitTest extends BaseTest { } }; - private Collection allMakers = - Arrays.asList(baseMaker); + private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker { + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + GenotypesContext gc = GenotypesContext.copy((List)data); + gc.buildCache(); + return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset); + } + + @Override + public GenotypesContext make(final List initialSamples) { + return new LazyGenotypesContext(this, initialSamples, initialSamples.size()); + } + } + + private Collection allMakers = Arrays.asList(baseMaker, new lazyMaker()); private class GenotypesContextProvider extends TestDataProvider { ContextMaker maker; @@ -163,7 +176,7 @@ public class GenotypesContextUnitTest extends BaseTest { Assert.assertFalse(gc.containsSamples(withMissing)); } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testImmutable(GenotypesContextProvider cfg) { GenotypesContext gc = cfg.makeContext(); Assert.assertEquals(gc.isMutable(), true); @@ -171,14 +184,14 @@ public class GenotypesContextUnitTest extends BaseTest { Assert.assertEquals(gc.isMutable(), false); } - @Test(dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class ) + @Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class ) public void testImmutableCall1(GenotypesContextProvider cfg) { GenotypesContext gc = cfg.makeContext(); gc.immutable(); gc.add(MISSING); } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testClear(GenotypesContextProvider cfg) { GenotypesContext gc = cfg.makeContext(); gc.clear(); @@ -197,7 +210,7 @@ public class GenotypesContextUnitTest extends BaseTest { return l; } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testAdds(GenotypesContextProvider cfg) { Genotype add1 = new Genotype("add1", Arrays.asList(Aref, Aref)); Genotype add2 = new Genotype("add2", Arrays.asList(Aref, Aref)); @@ -220,7 +233,7 @@ public class GenotypesContextUnitTest extends BaseTest { testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testRemoves(GenotypesContextProvider cfg) { Genotype rm1 = AA; Genotype rm2 = AC; @@ -257,7 +270,7 @@ public class GenotypesContextUnitTest extends BaseTest { testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes()); } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testSet(GenotypesContextProvider cfg) { Genotype set = new Genotype("replace", Arrays.asList(Aref, Aref)); int n = cfg.makeContext().size(); @@ -271,7 +284,7 @@ public class GenotypesContextUnitTest extends BaseTest { } } - @Test(dataProvider = "GenotypesContextProvider") + @Test(enabled = true, dataProvider = "GenotypesContextProvider") public void testReplace(GenotypesContextProvider cfg) { int n = cfg.makeContext().size(); for ( int i = 0; i < n; i++ ) {