Generalized interface to LazyGenotypesContext

-- Now you provide a LazyParsing object
-- LazyGenotypesContext now knows nothing about the VCF parser itself.  The parser holds all of the necessary data to parse the VCF genotypes when necessarily, and the LGC only has a pointer to this object
-- Using new interface added LazyGenotypesContext to unit tests with a simple lazy version
-- Deleted VCFParser interface, as it was no longer necessary
This commit is contained in:
Mark DePristo 2011-11-21 09:30:40 -05:00
parent f0ac588d32
commit 2e9ecf639e
9 changed files with 117 additions and 92 deletions

View File

@ -17,7 +17,7 @@ import java.util.*;
import java.util.zip.GZIPInputStream;
public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser {
public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
protected final static Logger log = Logger.getLogger(VCFCodec.class);
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
@ -59,6 +59,29 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
protected Map<String, String> stringCache = new HashMap<String, String>();
/**
* Creates a LazyParser for a LazyGenotypesContext to use to decode
* our genotypes only when necessary. We do this instead of eagarly
* decoding the genotypes just to turn around and reencode in the frequent
* case where we don't actually want to manipulate the genotypes
*/
class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser {
final List<Allele> alleles;
final String contig;
final int start;
LazyVCFGenotypesParser(final List<Allele> alleles, final String contig, final int start) {
this.alleles = alleles;
this.contig = contig;
this.start = start;
}
@Override
public LazyGenotypesContext.LazyData parse(final Object data) {
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
return createGenotypeMap((String) data, alleles, contig, start);
}
}
/**
* @param reader the line reader to take header lines from
@ -68,13 +91,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public abstract GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
/**
@ -294,7 +318,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
// do we have genotyping data
if (parts.length > NUM_STANDARD_FIELDS) {
LazyGenotypesContext lazy = new LazyGenotypesContext(this, parts[8], chr, pos, alleles, header.getGenotypeSamples().size());
final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos);
final int nGenotypes = header.getGenotypeSamples().size();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes);
builder.genotypesNoValidation(lazy);
}

View File

@ -229,7 +229,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
final GenotypesContext gc = vc.getGenotypes();
if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData());
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString());
} else {
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {

View File

@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.IOException;
@ -112,13 +109,14 @@ public class VCF3Codec extends AbstractVCFCodec {
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
@ -191,7 +189,7 @@ public class VCF3Codec extends AbstractVCFCodec {
}
}
return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder);
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override

View File

@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.IOException;
@ -141,11 +138,12 @@ public class VCFCodec extends AbstractVCFCodec {
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @return a mapping of sample name to genotype object
*/
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
@ -215,7 +213,7 @@ public class VCFCodec extends AbstractVCFCodec {
}
}
return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder);
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override

View File

@ -1,24 +0,0 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import java.util.List;
/**
* All VCF codecs need to implement this interface so that we can perform lazy loading.
*/
public interface VCFParser {
/**
* create a genotype map
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
}

View File

@ -251,7 +251,7 @@ public class GenotypesContext implements List<Genotype> {
// ---------------------------------------------------------------------------
@Ensures({"cacheIsInvalid == true"})
private synchronized void invalidateCaches() {
protected void invalidateCaches() {
cacheIsInvalid = true;
sampleNamesInOrder = null;
sampleNameToOffset = null;
@ -262,7 +262,7 @@ public class GenotypesContext implements List<Genotype> {
"sampleNameToOffset != null",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)",
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"})
protected synchronized void buildCache() {
protected void buildCache() {
if ( cacheIsInvalid ) {
cacheIsInvalid = false;
sampleNamesInOrder = new ArrayList<String>(size());
@ -291,6 +291,7 @@ public class GenotypesContext implements List<Genotype> {
@Override
public void clear() {
checkImmutability();
invalidateCaches();
getGenotypes().clear();
}

View File

@ -26,10 +26,8 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFParser;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -41,20 +39,10 @@ import java.util.Map;
* decoding the genotypes unnecessarily.
*/
public class LazyGenotypesContext extends GenotypesContext {
/** parser the VCF parser we'll use to decode unparsedGenotypeData if necessary */
final VCFParser parser;
/** The LazyParser we'll use to decode unparsedGenotypeData if necessary */
final LazyParser parser;
/** a string containing the unparsed VCF genotypes */
String unparsedGenotypeData;
/** alleles the current list of alleles at the site (known already in the parser) */
final List<Allele> alleles;
/** contig the current contig (known already in the parser) */
final String contig;
/** the current start position (known already in the parser) */
final int start;
Object unparsedGenotypeData;
/**
* nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data
@ -70,28 +58,48 @@ public class LazyGenotypesContext extends GenotypesContext {
private final static ArrayList<Genotype> EMPTY = new ArrayList<Genotype>(0);
/**
* Creates a new lazy loading genotypes context
*
* @param parser the VCF parser we'll use to decode unparsedGenotypeData if necessary
* @param unparsedGenotypeData a string containing the unparsed VCF genotypes
* @param contig the current contig (known already in the parser)
* @param start the current start position (known already in the parser)
* @param alleles the current list of alleles at the site (known already in the parser)
* @param nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data
* (known already in the parser). Useful for isEmpty and size() optimizations
* Simple lazy parser interface. Provide an object implementing this
* interface to LazyGenotypesContext, and it's parse method will be called
* when the use of the lazy context requires the underlying genotypes data
* be parsed into Genotype objects. The data argument is the data provided
* to the LazyGenotypesContext holding encoded genotypes data
*/
@Requires({"parser != null", "unparsedGenotypeData != null",
"contig != null", "start >= 0", "alleles != null && alleles.size() > 0",
"nUnparsedGenotypes > 0"})
public LazyGenotypesContext(final VCFParser parser, final String unparsedGenotypeData,
final String contig, final int start, final List<Allele> alleles,
int nUnparsedGenotypes ) {
public interface LazyParser {
public LazyData parse(Object data);
}
/**
* Returns the data used in the full GenotypesContext constructor
*
* {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List, boolean)}
*/
public static class LazyData {
final ArrayList<Genotype> genotypes;
final Map<String, Integer> sampleNameToOffset;
final List<String> sampleNamesInOrder;
public LazyData(final ArrayList<Genotype> genotypes,
final List<String> sampleNamesInOrder,
final Map<String, Integer> sampleNameToOffset) {
this.genotypes = genotypes;
this.sampleNamesInOrder = sampleNamesInOrder;
this.sampleNameToOffset = sampleNameToOffset;
}
}
/**
* Creates a new lazy loading genotypes context using the LazyParser to create
* genotypes data on demand.
*
* @param parser the parser to be used to load on-demand genotypes data
* @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary
* @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data
*/
@Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"})
public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) {
super(EMPTY, false);
this.unparsedGenotypeData = unparsedGenotypeData;
this.start = start;
this.parser = parser;
this.contig = contig;
this.alleles = alleles;
this.unparsedGenotypeData = unparsedGenotypeData;
this.nUnparsedGenotypes = nUnparsedGenotypes;
}
@ -108,10 +116,10 @@ public class LazyGenotypesContext extends GenotypesContext {
protected ArrayList<Genotype> getGenotypes() {
if ( ! loaded ) {
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
GenotypesContext subcontext = parser.createGenotypeMap(unparsedGenotypeData, alleles, contig, start);
notToBeDirectlyAccessedGenotypes = subcontext.notToBeDirectlyAccessedGenotypes;
sampleNamesInOrder = subcontext.sampleNamesInOrder;
sampleNameToOffset = subcontext.sampleNameToOffset;
LazyData parsed = parser.parse(unparsedGenotypeData);
notToBeDirectlyAccessedGenotypes = parsed.genotypes;
sampleNamesInOrder = parsed.sampleNamesInOrder;
sampleNameToOffset = parsed.sampleNameToOffset;
cacheIsInvalid = false; // these values build the cache
loaded = true;
unparsedGenotypeData = null; // don't hold the unparsed data any longer
@ -140,6 +148,13 @@ public class LazyGenotypesContext extends GenotypesContext {
}
}
@Override
protected void invalidateCaches() {
// if the cache is invalidated, and we haven't loaded our data yet, do so
if ( ! loaded ) getGenotypes();
super.invalidateCaches();
}
@Override
public boolean isEmpty() {
// optimization -- we know the number of samples in the unparsed data, so use it here to
@ -154,7 +169,7 @@ public class LazyGenotypesContext extends GenotypesContext {
return loaded ? super.size() : nUnparsedGenotypes;
}
public String getUnparsedGenotypeData() {
public Object getUnparsedGenotypeData() {
return unparsedGenotypeData;
}
}

View File

@ -1,11 +1,9 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.apache.commons.lang.Validate;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;

View File

@ -79,8 +79,21 @@ public class GenotypesContextUnitTest extends BaseTest {
}
};
private Collection<ContextMaker> allMakers =
Arrays.asList(baseMaker);
private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker {
@Override
public LazyGenotypesContext.LazyData parse(final Object data) {
GenotypesContext gc = GenotypesContext.copy((List<Genotype>)data);
gc.buildCache();
return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset);
}
@Override
public GenotypesContext make(final List<Genotype> initialSamples) {
return new LazyGenotypesContext(this, initialSamples, initialSamples.size());
}
}
private Collection<ContextMaker> allMakers = Arrays.asList(baseMaker, new lazyMaker());
private class GenotypesContextProvider extends TestDataProvider {
ContextMaker maker;
@ -163,7 +176,7 @@ public class GenotypesContextUnitTest extends BaseTest {
Assert.assertFalse(gc.containsSamples(withMissing));
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testImmutable(GenotypesContextProvider cfg) {
GenotypesContext gc = cfg.makeContext();
Assert.assertEquals(gc.isMutable(), true);
@ -171,14 +184,14 @@ public class GenotypesContextUnitTest extends BaseTest {
Assert.assertEquals(gc.isMutable(), false);
}
@Test(dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class )
@Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class )
public void testImmutableCall1(GenotypesContextProvider cfg) {
GenotypesContext gc = cfg.makeContext();
gc.immutable();
gc.add(MISSING);
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testClear(GenotypesContextProvider cfg) {
GenotypesContext gc = cfg.makeContext();
gc.clear();
@ -197,7 +210,7 @@ public class GenotypesContextUnitTest extends BaseTest {
return l;
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testAdds(GenotypesContextProvider cfg) {
Genotype add1 = new Genotype("add1", Arrays.asList(Aref, Aref));
Genotype add2 = new Genotype("add2", Arrays.asList(Aref, Aref));
@ -220,7 +233,7 @@ public class GenotypesContextUnitTest extends BaseTest {
testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testRemoves(GenotypesContextProvider cfg) {
Genotype rm1 = AA;
Genotype rm2 = AC;
@ -257,7 +270,7 @@ public class GenotypesContextUnitTest extends BaseTest {
testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes());
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testSet(GenotypesContextProvider cfg) {
Genotype set = new Genotype("replace", Arrays.asList(Aref, Aref));
int n = cfg.makeContext().size();
@ -271,7 +284,7 @@ public class GenotypesContextUnitTest extends BaseTest {
}
}
@Test(dataProvider = "GenotypesContextProvider")
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testReplace(GenotypesContextProvider cfg) {
int n = cfg.makeContext().size();
for ( int i = 0; i < n; i++ ) {