Vastly better way of doing on-demand genotype loading
-- With our GenotypesContext class we can naturally create a LazyGenotypesContext subclass that does the on-demand loading. -- This new class has replaced all of the old, complex functionality -- Better still, there were many cases where the genotypes were being loaded unnecessarily, resulting in inefficiency. This was detected because some of the integration tests changed as the genotypes were no longer being parsed unnecessarily -- Misc. bug fixes throughout the system -- Bug fixes for PhaseByTransmission with new GenotypesContext
This commit is contained in:
parent
f392d330c3
commit
9cb3fe3a59
|
|
@ -746,11 +746,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
|
||||
if (tracker != null) {
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
|
||||
GenotypesContext genotypeMap = vc.getGenotypes();
|
||||
|
||||
int mvCount;
|
||||
|
||||
GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes());
|
||||
for (Sample sample : trios) {
|
||||
Genotype mother = vc.getGenotype(sample.getMaternalID());
|
||||
Genotype father = vc.getGenotype(sample.getPaternalID());
|
||||
|
|
@ -761,18 +759,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
continue;
|
||||
|
||||
ArrayList<Genotype> trioGenotypes = new ArrayList<Genotype>(3);
|
||||
mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
||||
final int mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
|
||||
|
||||
Genotype phasedMother = trioGenotypes.get(0);
|
||||
Genotype phasedFather = trioGenotypes.get(1);
|
||||
Genotype phasedChild = trioGenotypes.get(2);
|
||||
|
||||
//Fill the genotype map with the new genotypes and increment metrics counters
|
||||
genotypeMap.add(phasedChild);
|
||||
genotypesContext.replace(phasedChild);
|
||||
if(mother != null){
|
||||
genotypeMap.add(phasedMother);
|
||||
genotypesContext.replace(phasedMother);
|
||||
if(father != null){
|
||||
genotypeMap.add(phasedFather);
|
||||
genotypesContext.replace(phasedFather);
|
||||
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
|
||||
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
|
||||
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||
|
|
@ -786,7 +784,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
}
|
||||
}
|
||||
else{
|
||||
genotypeMap.add(phasedFather);
|
||||
genotypesContext.replace(phasedFather);
|
||||
updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
|
||||
if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
|
||||
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
|
||||
|
|
@ -797,10 +795,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
|
|||
//TODO: ADAPT FOR PAIRS TOO!!
|
||||
if(mvCount>0 && mvFile != null)
|
||||
mvFile.println(mvfLine);
|
||||
|
||||
}
|
||||
|
||||
vcfWriter.add(new VariantContextBuilder(vc).genotypes(genotypeMap).make());
|
||||
builder.genotypes(genotypesContext);
|
||||
vcfWriter.add(builder.make());
|
||||
}
|
||||
return metricsCounters;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,10 +10,7 @@ import org.broad.tribble.util.BlockCompressedInputStream;
|
|||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -255,11 +252,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
*/
|
||||
private VariantContext parseVCFLine(String[] parts) {
|
||||
VariantContextBuilder builder = new VariantContextBuilder();
|
||||
builder.source(getName());
|
||||
|
||||
// increment the line count
|
||||
lineNo++;
|
||||
|
||||
// parse out the required fields
|
||||
builder.chr(getCachedString(parts[0]));
|
||||
final String chr = getCachedString(parts[0]);
|
||||
builder.chr(chr);
|
||||
int pos = Integer.valueOf(parts[1]);
|
||||
builder.start(pos);
|
||||
|
||||
|
|
@ -294,9 +294,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
|||
|
||||
// do we have genotyping data
|
||||
if (parts.length > NUM_STANDARD_FIELDS) {
|
||||
builder.attribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8]));
|
||||
builder.attribute(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this);
|
||||
builder.genotypesAreUnparsed();
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(this, parts[8], chr, pos, alleles, header.getGenotypeSamples().size());
|
||||
builder.genotypesNoValidation(lazy);
|
||||
}
|
||||
|
||||
VariantContext vc = null;
|
||||
|
|
|
|||
|
|
@ -219,9 +219,6 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
Map<String, String> infoFields = new TreeMap<String, String>();
|
||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
||||
String key = field.getKey();
|
||||
if ( key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
|
||||
continue;
|
||||
|
||||
String outputValue = formatVCFField(field.getValue());
|
||||
if ( outputValue != null )
|
||||
infoFields.put(key, outputValue);
|
||||
|
|
@ -229,9 +226,10 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
writeInfoString(infoFields);
|
||||
|
||||
// FORMAT
|
||||
if ( vc.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) ) {
|
||||
final GenotypesContext gc = vc.getGenotypes();
|
||||
if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) {
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
mWriter.write(vc.getAttributeAsString(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, ""));
|
||||
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData());
|
||||
} else {
|
||||
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
|
||||
GenotypesContext genotypes = GenotypesContext.create(nParts);
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
// get the format keys
|
||||
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
|
||||
|
|
@ -191,7 +191,7 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
}
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
return GenotypesContext.create(genotypes, header.sampleNameToOffset, header.sampleNamesInOrder);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -191,8 +191,6 @@ public class GCF {
|
|||
boolean first = true;
|
||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
||||
String key = field.getKey();
|
||||
if ( key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
|
||||
continue;
|
||||
int stringIndex = GCFHeaderBuilder.encodeString(key);
|
||||
String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
|
||||
if ( outputValue != null ) {
|
||||
|
|
|
|||
|
|
@ -55,8 +55,14 @@ public class GenotypesContext implements List<Genotype> {
|
|||
/** if true, then we need to reinitialize sampleNamesInOrder and sampleNameToOffset before we use them */
|
||||
boolean cacheIsInvalid = true;
|
||||
|
||||
/** An ArrayList of genotypes contained in this context */
|
||||
List<Genotype> genotypes;
|
||||
/**
|
||||
* An ArrayList of genotypes contained in this context
|
||||
*
|
||||
* WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY
|
||||
* ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD.
|
||||
*
|
||||
*/
|
||||
ArrayList<Genotype> notToBeDirectlyAccessedGenotypes;
|
||||
|
||||
/** Are we allowing users to modify the list? */
|
||||
boolean immutable = false;
|
||||
|
|
@ -70,7 +76,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
/**
|
||||
* Create an empty GenotypeContext
|
||||
*/
|
||||
private GenotypesContext() {
|
||||
protected GenotypesContext() {
|
||||
this(10, false);
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +84,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
* Create an empty GenotypeContext, with initial capacity for n elements
|
||||
*/
|
||||
@Requires("n >= 0")
|
||||
private GenotypesContext(final int n, final boolean immutable) {
|
||||
protected GenotypesContext(final int n, final boolean immutable) {
|
||||
this(new ArrayList<Genotype>(n), immutable);
|
||||
}
|
||||
|
||||
|
|
@ -86,8 +92,8 @@ public class GenotypesContext implements List<Genotype> {
|
|||
* Create an GenotypeContext containing genotypes
|
||||
*/
|
||||
@Requires("genotypes != null")
|
||||
private GenotypesContext(final ArrayList<Genotype> genotypes, final boolean immutable) {
|
||||
this.genotypes = genotypes;
|
||||
protected GenotypesContext(final ArrayList<Genotype> genotypes, final boolean immutable) {
|
||||
this.notToBeDirectlyAccessedGenotypes = genotypes;
|
||||
this.immutable = immutable;
|
||||
this.sampleNameToOffset = null;
|
||||
this.cacheIsInvalid = true;
|
||||
|
|
@ -110,11 +116,11 @@ public class GenotypesContext implements List<Genotype> {
|
|||
"sampleNamesInOrder != null",
|
||||
"genotypes.size() == sampleNameToOffset.size()",
|
||||
"genotypes.size() == sampleNamesInOrder.size()"})
|
||||
private GenotypesContext(final ArrayList<Genotype> genotypes,
|
||||
protected GenotypesContext(final ArrayList<Genotype> genotypes,
|
||||
final Map<String, Integer> sampleNameToOffset,
|
||||
final List<String> sampleNamesInOrder,
|
||||
final boolean immutable) {
|
||||
this.genotypes = genotypes;
|
||||
this.notToBeDirectlyAccessedGenotypes = genotypes;
|
||||
this.immutable = immutable;
|
||||
this.sampleNameToOffset = sampleNameToOffset;
|
||||
this.sampleNamesInOrder = sampleNamesInOrder;
|
||||
|
|
@ -203,7 +209,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
@Requires({"toCopy != null"})
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext copy(final GenotypesContext toCopy) {
|
||||
return create(new ArrayList<Genotype>(toCopy.genotypes));
|
||||
return create(new ArrayList<Genotype>(toCopy.getGenotypes()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -225,7 +231,6 @@ public class GenotypesContext implements List<Genotype> {
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
public final GenotypesContext immutable() {
|
||||
this.genotypes = Collections.unmodifiableList(genotypes);
|
||||
immutable = true;
|
||||
return this;
|
||||
}
|
||||
|
|
@ -255,16 +260,16 @@ public class GenotypesContext implements List<Genotype> {
|
|||
@Ensures({"cacheIsInvalid == false",
|
||||
"sampleNamesInOrder != null",
|
||||
"sampleNameToOffset != null",
|
||||
"sameSamples(genotypes, sampleNamesInOrder)",
|
||||
"sameSamples(genotypes, sampleNameToOffset.keySet())"})
|
||||
private synchronized void buildCache() {
|
||||
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNamesInOrder)",
|
||||
"sameSamples(notToBeDirectlyAccessedGenotypes, sampleNameToOffset.keySet())"})
|
||||
protected synchronized void buildCache() {
|
||||
if ( cacheIsInvalid ) {
|
||||
cacheIsInvalid = false;
|
||||
sampleNamesInOrder = new ArrayList<String>(genotypes.size());
|
||||
sampleNameToOffset = new HashMap<String, Integer>(genotypes.size());
|
||||
sampleNamesInOrder = new ArrayList<String>(size());
|
||||
sampleNameToOffset = new HashMap<String, Integer>(size());
|
||||
|
||||
for ( int i = 0; i < genotypes.size(); i++ ) {
|
||||
final Genotype g = genotypes.get(i);
|
||||
for ( int i = 0; i < size(); i++ ) {
|
||||
final Genotype g = getGenotypes().get(i);
|
||||
sampleNamesInOrder.add(g.getSampleName());
|
||||
sampleNameToOffset.put(g.getSampleName(), i);
|
||||
}
|
||||
|
|
@ -279,20 +284,24 @@ public class GenotypesContext implements List<Genotype> {
|
|||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
protected ArrayList<Genotype> getGenotypes() {
|
||||
return notToBeDirectlyAccessedGenotypes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
checkImmutability();
|
||||
genotypes.clear();
|
||||
getGenotypes().clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return genotypes.size();
|
||||
return getGenotypes().size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return genotypes.isEmpty();
|
||||
return getGenotypes().isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -300,14 +309,14 @@ public class GenotypesContext implements List<Genotype> {
|
|||
public boolean add(final Genotype genotype) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.add(genotype);
|
||||
return getGenotypes().add(genotype);
|
||||
}
|
||||
|
||||
@Requires("genotype != null")
|
||||
public boolean add(final Genotype ... genotype) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.addAll(Arrays.asList(genotype));
|
||||
return getGenotypes().addAll(Arrays.asList(genotype));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -319,7 +328,7 @@ public class GenotypesContext implements List<Genotype> {
|
|||
public boolean addAll(final Collection<? extends Genotype> genotypes) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return this.genotypes.addAll(genotypes);
|
||||
return getGenotypes().addAll(genotypes);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -329,38 +338,43 @@ public class GenotypesContext implements List<Genotype> {
|
|||
|
||||
@Override
|
||||
public boolean contains(final Object o) {
|
||||
return this.genotypes.contains(o);
|
||||
return getGenotypes().contains(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean containsAll(final Collection<?> objects) {
|
||||
return this.genotypes.containsAll(objects);
|
||||
return getGenotypes().containsAll(objects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Genotype get(final int i) {
|
||||
return genotypes.get(i);
|
||||
return getGenotypes().get(i);
|
||||
}
|
||||
|
||||
public Genotype get(final String sampleName) {
|
||||
buildCache();
|
||||
Integer offset = sampleNameToOffset.get(sampleName);
|
||||
return offset == null ? null : genotypes.get(offset);
|
||||
Integer offset = getSampleI(sampleName);
|
||||
return offset == null ? null : getGenotypes().get(offset);
|
||||
}
|
||||
|
||||
private Integer getSampleI(final String sampleName) {
|
||||
buildCache();
|
||||
return sampleNameToOffset.get(sampleName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int indexOf(final Object o) {
|
||||
return genotypes.indexOf(o);
|
||||
return getGenotypes().indexOf(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Genotype> iterator() {
|
||||
return genotypes.iterator();
|
||||
return getGenotypes().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int lastIndexOf(final Object o) {
|
||||
return genotypes.lastIndexOf(o);
|
||||
return getGenotypes().lastIndexOf(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -381,50 +395,67 @@ public class GenotypesContext implements List<Genotype> {
|
|||
public Genotype remove(final int i) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.remove(i);
|
||||
return getGenotypes().remove(i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean remove(final Object o) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.remove(o);
|
||||
return getGenotypes().remove(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean removeAll(final Collection<?> objects) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.removeAll(objects);
|
||||
return getGenotypes().removeAll(objects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retainAll(final Collection<?> objects) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.retainAll(objects);
|
||||
return getGenotypes().retainAll(objects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Genotype set(final int i, final Genotype genotype) {
|
||||
checkImmutability();
|
||||
invalidateCaches();
|
||||
return genotypes.set(i, genotype);
|
||||
return getGenotypes().set(i, genotype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces the genotype in this context -- note for efficiency
|
||||
* reasons we do not add the genotype if it's not present. The
|
||||
* return value will be null indicating this happened.
|
||||
* @param genotype a non null genotype to bind in this context
|
||||
* @return null if genotype was not added, otherwise returns the previous genotype
|
||||
*/
|
||||
@Requires("genotype != null")
|
||||
public Genotype replace(final Genotype genotype) {
|
||||
checkImmutability();
|
||||
Integer offset = getSampleI(genotype.getSampleName());
|
||||
if ( offset == null )
|
||||
return null;
|
||||
else
|
||||
return getGenotypes().set(offset, genotype);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Genotype> subList(final int i, final int i1) {
|
||||
return genotypes.subList(i, i1);
|
||||
return getGenotypes().subList(i, i1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] toArray() {
|
||||
return genotypes.toArray();
|
||||
return getGenotypes().toArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T[] toArray(final T[] ts) {
|
||||
return genotypes.toArray(ts);
|
||||
return getGenotypes().toArray(ts);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -528,13 +559,13 @@ public class GenotypesContext implements List<Genotype> {
|
|||
@Requires("samples != null")
|
||||
@Ensures("result != null")
|
||||
public GenotypesContext subsetToSamples( final Set<String> samples ) {
|
||||
if ( samples.size() == genotypes.size() )
|
||||
if ( samples.size() == size() )
|
||||
return this;
|
||||
else if ( samples.isEmpty() )
|
||||
return NO_GENOTYPES;
|
||||
else {
|
||||
GenotypesContext subset = create(samples.size());
|
||||
for ( final Genotype g : genotypes ) {
|
||||
for ( final Genotype g : getGenotypes() ) {
|
||||
if ( samples.contains(g.getSampleName()) ) {
|
||||
subset.add(g);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext;
|
||||
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFParser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* [Short one sentence description of this walker]
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Functionality of this walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Input description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class LazyGenotypesContext extends GenotypesContext {
|
||||
final VCFParser parser;
|
||||
String unparsedGenotypeData;
|
||||
final List<Allele> alleles;
|
||||
final String contig;
|
||||
final int start;
|
||||
final int nUnparsedGenotypes;
|
||||
|
||||
boolean loaded = false;
|
||||
|
||||
private final static ArrayList<Genotype> EMPTY = new ArrayList<Genotype>(0);
|
||||
|
||||
public LazyGenotypesContext(final VCFParser parser, final String unparsedGenotypeData,
|
||||
final String contig, final int start, final List<Allele> alleles,
|
||||
int nUnparsedGenotypes ) {
|
||||
super(EMPTY, false);
|
||||
this.unparsedGenotypeData = unparsedGenotypeData;
|
||||
this.start = start;
|
||||
this.parser = parser;
|
||||
this.contig = contig;
|
||||
this.alleles = alleles;
|
||||
this.nUnparsedGenotypes = nUnparsedGenotypes;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ArrayList<Genotype> getGenotypes() {
|
||||
if ( ! loaded ) {
|
||||
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
|
||||
GenotypesContext subcontext = parser.createGenotypeMap(unparsedGenotypeData, alleles, contig, start);
|
||||
notToBeDirectlyAccessedGenotypes = subcontext.notToBeDirectlyAccessedGenotypes;
|
||||
sampleNamesInOrder = subcontext.sampleNamesInOrder;
|
||||
sampleNameToOffset = subcontext.sampleNameToOffset;
|
||||
cacheIsInvalid = false;
|
||||
loaded = true;
|
||||
unparsedGenotypeData = null;
|
||||
|
||||
// warning -- this path allows us to create a VariantContext that doesn't run validateGenotypes()
|
||||
// That said, it's not such an important routine -- it's just checking that the genotypes
|
||||
// are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec
|
||||
}
|
||||
|
||||
return notToBeDirectlyAccessedGenotypes;
|
||||
}
|
||||
|
||||
protected synchronized void buildCache() {
|
||||
if ( cacheIsInvalid ) {
|
||||
getGenotypes(); // will load up all of the necessary data
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
// optimization -- we know the number of samples in the unparsed data, so use it here to
|
||||
// avoid parsing just to know if the genotypes context is empty
|
||||
return loaded ? super.isEmpty() : nUnparsedGenotypes == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
// optimization -- we know the number of samples in the unparsed data, so use it here to
|
||||
// avoid parsing just to know the size of the context
|
||||
return loaded ? super.size() : nUnparsedGenotypes;
|
||||
}
|
||||
|
||||
public String getUnparsedGenotypeData() {
|
||||
return unparsedGenotypeData;
|
||||
}
|
||||
}
|
||||
|
|
@ -165,8 +165,6 @@ import java.util.*;
|
|||
public class VariantContext implements Feature { // to enable tribble intergration
|
||||
protected CommonInfo commonInfo = null;
|
||||
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
|
||||
public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_";
|
||||
public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_";
|
||||
|
||||
@Deprecated // ID is no longer stored in the attributes map
|
||||
private final static String ID_KEY = "ID";
|
||||
|
|
@ -231,7 +229,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @param other the VariantContext to copy
|
||||
*/
|
||||
protected VariantContext(VariantContext other) {
|
||||
this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false, NO_VALIDATION);
|
||||
this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(),
|
||||
other.getAlleles(), other.getGenotypes(), other.getLog10PError(),
|
||||
other.getFiltersMaybeNull(),
|
||||
other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL,
|
||||
NO_VALIDATION);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -247,14 +249,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @param filters filters: use null for unfiltered and empty set for passes filters
|
||||
* @param attributes attributes
|
||||
* @param referenceBaseForIndel padded reference base
|
||||
* @param genotypesAreUnparsed true if the genotypes have not yet been parsed
|
||||
* @param validationToPerform set of validation steps to take
|
||||
*/
|
||||
protected VariantContext(String source, String ID,
|
||||
String contig, long start, long stop,
|
||||
Collection<Allele> alleles, GenotypesContext genotypes,
|
||||
double log10PError, Set<String> filters, Map<String, Object> attributes,
|
||||
Byte referenceBaseForIndel, boolean genotypesAreUnparsed,
|
||||
Byte referenceBaseForIndel,
|
||||
EnumSet<Validation> validationToPerform ) {
|
||||
if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); }
|
||||
this.contig = contig;
|
||||
|
|
@ -265,17 +266,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string");
|
||||
this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID;
|
||||
|
||||
if ( !genotypesAreUnparsed && attributes != null ) {
|
||||
if ( attributes.containsKey(UNPARSED_GENOTYPE_MAP_KEY) ) {
|
||||
attributes = new HashMap<String, Object>(attributes);
|
||||
attributes.remove(UNPARSED_GENOTYPE_MAP_KEY);
|
||||
}
|
||||
if ( attributes.containsKey(UNPARSED_GENOTYPE_PARSER_KEY) ) {
|
||||
attributes = new HashMap<String, Object>(attributes);
|
||||
attributes.remove(UNPARSED_GENOTYPE_PARSER_KEY);
|
||||
}
|
||||
}
|
||||
|
||||
this.commonInfo = new CommonInfo(source, log10PError, filters, attributes);
|
||||
REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel;
|
||||
|
||||
|
|
@ -316,13 +306,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public VariantContext subContextFromSamples(Set<String> sampleNames, Collection<Allele> alleles) {
|
||||
loadGenotypes();
|
||||
VariantContextBuilder builder = new VariantContextBuilder(this);
|
||||
return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make();
|
||||
}
|
||||
|
||||
public VariantContext subContextFromSamples(Set<String> sampleNames) {
|
||||
loadGenotypes();
|
||||
VariantContextBuilder builder = new VariantContextBuilder(this);
|
||||
GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames);
|
||||
return builder.genotypes(newGenotypes).alleles(allelesOfGenotypes(newGenotypes)).make();
|
||||
|
|
@ -698,35 +686,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void loadGenotypes() {
|
||||
if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) {
|
||||
if ( genotypes == null )
|
||||
genotypes = NO_GENOTYPES;
|
||||
return;
|
||||
}
|
||||
|
||||
Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY);
|
||||
if ( parserObj == null || !(parserObj instanceof VCFParser) )
|
||||
throw new IllegalStateException("There is no VCF parser stored to unparse the genotype data");
|
||||
VCFParser parser = (VCFParser)parserObj;
|
||||
|
||||
Object mapObj = getAttribute(UNPARSED_GENOTYPE_MAP_KEY);
|
||||
if ( mapObj == null )
|
||||
throw new IllegalStateException("There is no mapping string stored to unparse the genotype data");
|
||||
|
||||
genotypes = parser.createGenotypeMap(mapObj.toString(), new ArrayList<Allele>(alleles), getChr(), getStart());
|
||||
|
||||
commonInfo.removeAttribute(UNPARSED_GENOTYPE_MAP_KEY);
|
||||
commonInfo.removeAttribute(UNPARSED_GENOTYPE_PARSER_KEY);
|
||||
|
||||
validateGenotypes();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of samples in the context
|
||||
*/
|
||||
public int getNSamples() {
|
||||
loadGenotypes();
|
||||
return genotypes.size();
|
||||
}
|
||||
|
||||
|
|
@ -734,12 +697,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @return true if the context has associated genotypes
|
||||
*/
|
||||
public boolean hasGenotypes() {
|
||||
loadGenotypes();
|
||||
return ! genotypes.isEmpty();
|
||||
}
|
||||
|
||||
public boolean hasGenotypes(Collection<String> sampleNames) {
|
||||
loadGenotypes();
|
||||
return genotypes.containsSamples(sampleNames);
|
||||
}
|
||||
|
||||
|
|
@ -747,17 +708,14 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @return set of all Genotypes associated with this context
|
||||
*/
|
||||
public GenotypesContext getGenotypes() {
|
||||
loadGenotypes();
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
public Iterable<Genotype> getGenotypesOrderedByName() {
|
||||
loadGenotypes();
|
||||
return genotypes.iterateInSampleNameOrder();
|
||||
}
|
||||
|
||||
public Iterable<Genotype> getGenotypesOrderedBy(Iterable<String> sampleOrdering) {
|
||||
loadGenotypes();
|
||||
return genotypes.iterateInSampleNameOrder(sampleOrdering);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,6 @@ public class VariantContextBuilder {
|
|||
private Map<String, Object> attributes = null;
|
||||
private boolean attributesCanBeModified = false;
|
||||
private Byte referenceBaseForIndel = null;
|
||||
private boolean genotypesAreUnparsed = false;
|
||||
|
||||
/** enum of what must be validated */
|
||||
final private EnumSet<VariantContext.Validation> toValidate = EnumSet.noneOf(VariantContext.Validation.class);
|
||||
|
|
@ -112,7 +111,6 @@ public class VariantContextBuilder {
|
|||
this.contig = parent.contig;
|
||||
this.filters = parent.getFiltersMaybeNull();
|
||||
this.genotypes = parent.genotypes;
|
||||
this.genotypesAreUnparsed = parent.hasAttribute(VariantContext.UNPARSED_GENOTYPE_MAP_KEY);
|
||||
this.ID = parent.getID();
|
||||
this.log10PError = parent.getLog10PError();
|
||||
this.referenceBaseForIndel = parent.getReferenceBaseForIndel();
|
||||
|
|
@ -179,7 +177,7 @@ public class VariantContextBuilder {
|
|||
|
||||
/**
|
||||
* Makes the attributes field modifiable. In many cases attributes is just a pointer to an immutable
|
||||
* collection, so methods that want to add / remove records require the attributes to be copied first
|
||||
* collection, so methods that want to add / remove records require the attributes to be copied to a
|
||||
*/
|
||||
private void makeAttributesModifiable() {
|
||||
if ( ! attributesCanBeModified ) {
|
||||
|
|
@ -243,6 +241,11 @@ public class VariantContextBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
 * Stores the given GenotypesContext in this builder exactly as passed in.
 * This method itself neither copies nor checks the argument.
 *
 * @param genotypes the genotypes context to use; kept by reference
 * @return this builder, so calls can be chained
 */
public VariantContextBuilder genotypesNoValidation(final GenotypesContext genotypes) {
|
||||
this.genotypes = genotypes;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes
|
||||
*
|
||||
|
|
@ -270,15 +273,6 @@ public class VariantContextBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* ADVANCED! tells us that the genotypes data is stored as an unparsed attribute
|
||||
* @return this builder, with its genotypesAreUnparsed flag set to true
|
||||
*/
|
||||
public VariantContextBuilder genotypesAreUnparsed() {
|
||||
this.genotypesAreUnparsed = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have ID
|
||||
* @param ID
|
||||
|
|
@ -395,6 +389,6 @@ public class VariantContextBuilder {
|
|||
public VariantContext make() {
|
||||
// Creates a new VariantContext from the values currently held by this builder.
return new VariantContext(source, ID, contig, start, stop, alleles,
|
||||
genotypes, log10PError, filters, attributes,
|
||||
// NOTE(review): the next two lines are alternative endings of the same constructor call
// (with and without genotypesAreUnparsed); they look like the before/after lines of a
// diff -- confirm which one applies.
referenceBaseForIndel, genotypesAreUnparsed, toValidate);
|
||||
referenceBaseForIndel, toValidate);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,9 +14,12 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testNoAction() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("8a105fa5eebdfffe7326bc5b3d8ffd1c"));
|
||||
Arrays.asList("b7b7c218e219cd923ce5b6eefc5b7171"));
|
||||
executeTest("test no action", spec);
|
||||
}
|
||||
|
||||
|
|
@ -24,59 +27,86 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testClusteredSnps() {
|
||||
// Base command line plus "-window 10" on vcfexample2.vcf; the hex string passed below is
// presumably the expected MD5 of the output -- TODO confirm against WalkerTestSpec.
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -window 10 --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("27b13f179bb4920615dff3a32730d845"));
|
||||
Arrays.asList("6d45a19e4066e7de6ff6a61f43ffad2b"));
|
||||
executeTest("test clustered SNPs", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
// NOTE(review): two method signatures follow (testMasks / testMask1); they look like the
// before/after lines of a diff -- confirm which one is current.
public void testMasks() {
|
||||
public void testMask1() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
baseTestString() + " -maskName foo --mask:VCF3 " + validationDataLocation + "vcfexample2.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("578f9e774784c25871678e6464fd212b"));
|
||||
Arrays.asList("65b5006bf3ee9d9d08a36d6b854773f2"));
|
||||
executeTest("test mask all", spec1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMask2() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||
baseTestString() + " -maskName foo --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("bfa86a674aefca1b13d341cb14ab3c4f"));
|
||||
Arrays.asList("a275d36baca81a1ce03dbb528e95a069"));
|
||||
executeTest("test mask some", spec2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMask3() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec3 = new WalkerTestSpec(
|
||||
baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("5939f80d14b32d88587373532d7b90e5"));
|
||||
Arrays.asList("c9489e1c1342817c36ab4f0770609bdb"));
|
||||
executeTest("test mask extend", spec3);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilter1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("45219dbcfb6f81bba2ea0c35f5bfd368"));
|
||||
Arrays.asList("327a611bf82c6c4ae77fbb6d06359f9d"));
|
||||
executeTest("test filter #1", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilter2() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("c95845e817da7352b9b72bc9794f18fb"));
|
||||
Arrays.asList("7612b3460575402ad78fa4173178bdcc"));
|
||||
executeTest("test filter #2", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterWithSeparateNames() {
|
||||
// note that this input is slightly malformed, but now that genotypes from VCF files
|
||||
// are only loaded when really needed, we don't actually
|
||||
// fix the file in the output
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
// NOTE(review): two expected-value lines follow; they look like the before/after
// lines of a diff -- confirm which one is current.
Arrays.asList("b8cdd7f44ff1a395e0a9b06a87e1e530"));
|
||||
Arrays.asList("dce33441f58b284ac9ab94f8e64b84e3"));
|
||||
executeTest("test filter with separate names #2", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
// NOTE(review): two method signatures follow (testGenotypeFilters / testGenotypeFilters1);
// they look like the before/after lines of a diff -- confirm which one is current.
public void testGenotypeFilters() {
|
||||
public void testGenotypeFilters1() {
|
||||
// Base command line plus the genotype-level filter expression 'GQ == 0.60', named "foo".
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("96b61e4543a73fe725e433f007260039"));
|
||||
|
||||
executeTest("test genotype filter #1", spec1);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenotypeFilters2() {
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||
baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("6c8112ab17ce39c8022c891ae73bf38e"));
|
||||
|
|
|
|||
Loading…
Reference in New Issue