Generalized interface to LazyGenotypesContext
-- Now you provide a LazyParsing object -- LazyGenotypesContext now knows nothing about the VCF parser itself. The parser holds all of the necessary data to parse the VCF genotypes when necessarily, and the LGC only has a pointer to this object -- Using new interface added LazyGenotypesContext to unit tests with a simple lazy version -- Deleted VCFParser interface, as it was no longer necessary
This commit is contained in:
parent
f0ac588d32
commit
2e9ecf639e
|
|
@ -17,7 +17,7 @@ import java.util.*;
|
|||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
|
||||
public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser {
|
||||
public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||
|
||||
protected final static Logger log = Logger.getLogger(VCFCodec.class);
|
||||
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
|
||||
|
|
@ -59,6 +59,29 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
|
||||
protected Map<String, String> stringCache = new HashMap<String, String>();
|
||||
|
||||
/**
|
||||
* Creates a LazyParser for a LazyGenotypesContext to use to decode
|
||||
* our genotypes only when necessary. We do this instead of eagarly
|
||||
* decoding the genotypes just to turn around and reencode in the frequent
|
||||
* case where we don't actually want to manipulate the genotypes
|
||||
*/
|
||||
class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser {
|
||||
final List<Allele> alleles;
|
||||
final String contig;
|
||||
final int start;
|
||||
|
||||
LazyVCFGenotypesParser(final List<Allele> alleles, final String contig, final int start) {
|
||||
this.alleles = alleles;
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
|
||||
return createGenotypeMap((String) data, alleles, contig, start);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param reader the line reader to take header lines from
|
||||
|
|
@ -68,13 +91,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
|
||||
/**
|
||||
* create a genotype map
|
||||
*
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @param chr chrom
|
||||
* @param pos position
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public abstract GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
|
||||
public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -294,7 +318,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
|
||||
// do we have genotyping data
|
||||
if (parts.length > NUM_STANDARD_FIELDS) {
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(this, parts[8], chr, pos, alleles, header.getGenotypeSamples().size());
|
||||
final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos);
|
||||
final int nGenotypes = header.getGenotypeSamples().size();
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes);
|
||||
builder.genotypesNoValidation(lazy);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -229,7 +229,7 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
final GenotypesContext gc = vc.getGenotypes();
|
||||
if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) {
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData());
|
||||
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString());
|
||||
} else {
|
||||
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
|
|
|
|||
|
|
@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
|
@ -112,13 +109,14 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
|
||||
/**
|
||||
* create a genotype map
|
||||
*
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @param chr chrom
|
||||
* @param pos position
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
|
||||
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
|
||||
if (genotypeParts == null)
|
||||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
|
|
@ -191,7 +189,7 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
}
|
||||
}
|
||||
|
||||
return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder);
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -3,10 +3,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
|
@ -141,11 +138,12 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
|
||||
/**
|
||||
* create a genotype map
|
||||
*
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
|
||||
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
|
||||
if (genotypeParts == null)
|
||||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
|
|
@ -215,7 +213,7 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
}
|
||||
}
|
||||
|
||||
return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder);
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -1,24 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* All VCF codecs need to implement this interface so that we can perform lazy loading.
|
||||
*/
|
||||
public interface VCFParser {
|
||||
|
||||
/**
|
||||
* create a genotype map
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @param chr chrom
|
||||
* @param pos position
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public GenotypesContext createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
|
||||
|
||||
}
|
||||
|
|
@ -251,7 +251,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
@Ensures({"cacheIsInvalid == true"})
|
||||
private synchronized void invalidateCaches() {
|
||||
protected void invalidateCaches() {
|
||||
cacheIsInvalid = true;
|
||||
sampleNamesInOrder = null;
|
||||
sampleNameToOffset = null;
|
||||
|
|
@ -262,7 +262,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
"sampleNameToOffset != null",
|
||||
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)",
|
||||
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"})
|
||||
protected synchronized void buildCache() {
|
||||
protected void buildCache() {
|
||||
if ( cacheIsInvalid ) {
|
||||
cacheIsInvalid = false;
|
||||
sampleNamesInOrder = new ArrayList<String>(size());
|
||||
|
|
@ -291,6 +291,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
@Override
|
||||
public void clear() {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
getGenotypes().clear();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,10 +26,8 @@ package org.broadinstitute.sting.utils.variantcontext;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFParser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -41,20 +39,10 @@ import java.util.Map;
|
|||
* decoding the genotypes unnecessarily.
|
||||
*/
|
||||
public class LazyGenotypesContext extends GenotypesContext {
|
||||
/** parser the VCF parser we'll use to decode unparsedGenotypeData if necessary */
|
||||
final VCFParser parser;
|
||||
/** The LazyParser we'll use to decode unparsedGenotypeData if necessary */
|
||||
final LazyParser parser;
|
||||
|
||||
/** a string containing the unparsed VCF genotypes */
|
||||
String unparsedGenotypeData;
|
||||
|
||||
/** alleles the current list of alleles at the site (known already in the parser) */
|
||||
final List<Allele> alleles;
|
||||
|
||||
/** contig the current contig (known already in the parser) */
|
||||
final String contig;
|
||||
|
||||
/** the current start position (known already in the parser) */
|
||||
final int start;
|
||||
Object unparsedGenotypeData;
|
||||
|
||||
/**
|
||||
* nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data
|
||||
|
|
@ -70,28 +58,48 @@ public class LazyGenotypesContext extends GenotypesContext {
|
|||
private final static ArrayList<Genotype> EMPTY = new ArrayList<Genotype>(0);
|
||||
|
||||
/**
|
||||
* Creates a new lazy loading genotypes context
|
||||
*
|
||||
* @param parser the VCF parser we'll use to decode unparsedGenotypeData if necessary
|
||||
* @param unparsedGenotypeData a string containing the unparsed VCF genotypes
|
||||
* @param contig the current contig (known already in the parser)
|
||||
* @param start the current start position (known already in the parser)
|
||||
* @param alleles the current list of alleles at the site (known already in the parser)
|
||||
* @param nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data
|
||||
* (known already in the parser). Useful for isEmpty and size() optimizations
|
||||
* Simple lazy parser interface. Provide an object implementing this
|
||||
* interface to LazyGenotypesContext, and it's parse method will be called
|
||||
* when the use of the lazy context requires the underlying genotypes data
|
||||
* be parsed into Genotype objects. The data argument is the data provided
|
||||
* to the LazyGenotypesContext holding encoded genotypes data
|
||||
*/
|
||||
@Requires({"parser != null", "unparsedGenotypeData != null",
|
||||
"contig != null", "start >= 0", "alleles != null && alleles.size() > 0",
|
||||
"nUnparsedGenotypes > 0"})
|
||||
public LazyGenotypesContext(final VCFParser parser, final String unparsedGenotypeData,
|
||||
final String contig, final int start, final List<Allele> alleles,
|
||||
int nUnparsedGenotypes ) {
|
||||
public interface LazyParser {
|
||||
public LazyData parse(Object data);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the data used in the full GenotypesContext constructor
|
||||
*
|
||||
* {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List, boolean)}
|
||||
*/
|
||||
public static class LazyData {
|
||||
final ArrayList<Genotype> genotypes;
|
||||
final Map<String, Integer> sampleNameToOffset;
|
||||
final List<String> sampleNamesInOrder;
|
||||
|
||||
public LazyData(final ArrayList<Genotype> genotypes,
|
||||
final List<String> sampleNamesInOrder,
|
||||
final Map<String, Integer> sampleNameToOffset) {
|
||||
this.genotypes = genotypes;
|
||||
this.sampleNamesInOrder = sampleNamesInOrder;
|
||||
this.sampleNameToOffset = sampleNameToOffset;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new lazy loading genotypes context using the LazyParser to create
|
||||
* genotypes data on demand.
|
||||
*
|
||||
* @param parser the parser to be used to load on-demand genotypes data
|
||||
* @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary
|
||||
* @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data
|
||||
*/
|
||||
@Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"})
|
||||
public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) {
|
||||
super(EMPTY, false);
|
||||
this.unparsedGenotypeData = unparsedGenotypeData;
|
||||
this.start = start;
|
||||
this.parser = parser;
|
||||
this.contig = contig;
|
||||
this.alleles = alleles;
|
||||
this.unparsedGenotypeData = unparsedGenotypeData;
|
||||
this.nUnparsedGenotypes = nUnparsedGenotypes;
|
||||
}
|
||||
|
||||
|
|
@ -108,10 +116,10 @@ public class LazyGenotypesContext extends GenotypesContext {
|
|||
protected ArrayList<Genotype> getGenotypes() {
|
||||
if ( ! loaded ) {
|
||||
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
|
||||
GenotypesContext subcontext = parser.createGenotypeMap(unparsedGenotypeData, alleles, contig, start);
|
||||
notToBeDirectlyAccessedGenotypes = subcontext.notToBeDirectlyAccessedGenotypes;
|
||||
sampleNamesInOrder = subcontext.sampleNamesInOrder;
|
||||
sampleNameToOffset = subcontext.sampleNameToOffset;
|
||||
LazyData parsed = parser.parse(unparsedGenotypeData);
|
||||
notToBeDirectlyAccessedGenotypes = parsed.genotypes;
|
||||
sampleNamesInOrder = parsed.sampleNamesInOrder;
|
||||
sampleNameToOffset = parsed.sampleNameToOffset;
|
||||
cacheIsInvalid = false; // these values build the cache
|
||||
loaded = true;
|
||||
unparsedGenotypeData = null; // don't hold the unparsed data any longer
|
||||
|
|
@ -140,6 +148,13 @@ public class LazyGenotypesContext extends GenotypesContext {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void invalidateCaches() {
|
||||
// if the cache is invalidated, and we haven't loaded our data yet, do so
|
||||
if ( ! loaded ) getGenotypes();
|
||||
super.invalidateCaches();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
// optimization -- we know the number of samples in the unparsed data, so use it here to
|
||||
|
|
@ -154,7 +169,7 @@ public class LazyGenotypesContext extends GenotypesContext {
|
|||
return loaded ? super.size() : nUnparsedGenotypes;
|
||||
}
|
||||
|
||||
public String getUnparsedGenotypeData() {
|
||||
public Object getUnparsedGenotypeData() {
|
||||
return unparsedGenotypeData;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext;
|
||||
|
||||
import org.apache.commons.lang.Validate;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
|
|
|||
|
|
@ -79,8 +79,21 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
}
|
||||
};
|
||||
|
||||
private Collection<ContextMaker> allMakers =
|
||||
Arrays.asList(baseMaker);
|
||||
private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker {
|
||||
@Override
|
||||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
GenotypesContext gc = GenotypesContext.copy((List<Genotype>)data);
|
||||
gc.buildCache();
|
||||
return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenotypesContext make(final List<Genotype> initialSamples) {
|
||||
return new LazyGenotypesContext(this, initialSamples, initialSamples.size());
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<ContextMaker> allMakers = Arrays.asList(baseMaker, new lazyMaker());
|
||||
|
||||
private class GenotypesContextProvider extends TestDataProvider {
|
||||
ContextMaker maker;
|
||||
|
|
@ -163,7 +176,7 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
Assert.assertFalse(gc.containsSamples(withMissing));
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testImmutable(GenotypesContextProvider cfg) {
|
||||
GenotypesContext gc = cfg.makeContext();
|
||||
Assert.assertEquals(gc.isMutable(), true);
|
||||
|
|
@ -171,14 +184,14 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
Assert.assertEquals(gc.isMutable(), false);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class )
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class )
|
||||
public void testImmutableCall1(GenotypesContextProvider cfg) {
|
||||
GenotypesContext gc = cfg.makeContext();
|
||||
gc.immutable();
|
||||
gc.add(MISSING);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testClear(GenotypesContextProvider cfg) {
|
||||
GenotypesContext gc = cfg.makeContext();
|
||||
gc.clear();
|
||||
|
|
@ -197,7 +210,7 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
return l;
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testAdds(GenotypesContextProvider cfg) {
|
||||
Genotype add1 = new Genotype("add1", Arrays.asList(Aref, Aref));
|
||||
Genotype add2 = new Genotype("add2", Arrays.asList(Aref, Aref));
|
||||
|
|
@ -220,7 +233,7 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testRemoves(GenotypesContextProvider cfg) {
|
||||
Genotype rm1 = AA;
|
||||
Genotype rm2 = AC;
|
||||
|
|
@ -257,7 +270,7 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes());
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testSet(GenotypesContextProvider cfg) {
|
||||
Genotype set = new Genotype("replace", Arrays.asList(Aref, Aref));
|
||||
int n = cfg.makeContext().size();
|
||||
|
|
@ -271,7 +284,7 @@ public class GenotypesContextUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "GenotypesContextProvider")
|
||||
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
|
||||
public void testReplace(GenotypesContextProvider cfg) {
|
||||
int n = cfg.makeContext().size();
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue