Genotype filters are now just Strings, not Set<String>

This commit is contained in:
Mark DePristo 2012-06-28 14:27:09 -04:00
parent f631be8d80
commit 6bea28ae6f
12 changed files with 163 additions and 88 deletions

View File

@ -297,7 +297,8 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
// for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) {
List<String> filters = new ArrayList<String>(g.getFilters());
final List<String> filters = new ArrayList<String>();
if ( g.isFiltered() ) filters.add(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )

View File

@ -276,9 +276,8 @@ public class BCF2GenotypeFieldDecoders {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor, numElements);
if ( value != null ) { // don't add missing values
gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List<String>)value);
}
assert value == null || value instanceof String;
gb.filter((String)value);
}
}
}

View File

@ -183,7 +183,7 @@ public class VCFStandardHeaderLines {
registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"));
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed"));
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter"));
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter"));
// INFO lines
registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));

View File

@ -111,8 +111,9 @@ public final class FastGenotype extends Genotype {
final int DP,
final int[] AD,
final int[] PL,
final String filters,
final Map<String, Object> extendedAttributes) {
super(sampleName);
super(sampleName, filters);
this.alleles = alleles;
this.isPhased = isPhased;
this.GQ = GQ;
@ -152,10 +153,6 @@ public final class FastGenotype extends Genotype {
return GQ;
}
@Override public List<String> getFilters() {
return (List<String>) getExtendedAttribute(VCFConstants.GENOTYPE_FILTER_KEY, Collections.emptyList());
}
@Override public int[] getPL() {
return PL;
}

View File

@ -27,6 +27,7 @@ public abstract class Genotype implements Comparable<Genotype> {
* extended attributes map
*/
public final static Collection<String> PRIMARY_KEYS = Arrays.asList(
VCFConstants.GENOTYPE_FILTER_KEY,
VCFConstants.GENOTYPE_KEY,
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
@ -38,14 +39,11 @@ public abstract class Genotype implements Comparable<Genotype> {
private final String sampleName;
private GenotypeType type = null;
private final String filters;
protected Genotype(final String sampleName) {
protected Genotype(final String sampleName, final String filters) {
this.sampleName = sampleName;
}
protected Genotype(final String sampleName, final GenotypeType type) {
this.sampleName = sampleName;
this.type = type;
this.filters = filters;
}
/**
@ -348,13 +346,14 @@ public abstract class Genotype implements Comparable<Genotype> {
}
public String toString() {
return String.format("[%s %s%s%s%s%s%s]",
return String.format("[%s %s%s%s%s%s%s%s]",
getSampleName(),
getGenotypeString(false),
toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()),
toStringIfExists(VCFConstants.DEPTH_KEY, getDP()),
toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()),
toStringIfExists(VCFConstants.GENOTYPE_PL_KEY, getPL()),
toStringIfExists(VCFConstants.GENOTYPE_FILTER_KEY, getFilters()),
sortedString(getExtendedAttributes()));
}
@ -448,15 +447,25 @@ public abstract class Genotype implements Comparable<Genotype> {
}
/**
* Returns the filter string associated with this Genotype.
*
* @return
* @return If this result == null, then the genotype is considered PASSing filters
* If the result != null, then the genotype has failed filtering for the reason(s)
* specified in result. To be reference compliant multiple filter field
* string values can be encoded with a ; separator.
*/
@Ensures({"result != null"})
public abstract List<String> getFilters();
public final String getFilters() {
return filters;
}
@Ensures({"result != getFilters().isEmpty()"})
public boolean isFiltered() {
return ! getFilters().isEmpty();
/**
* Is this genotype filtered or not?
*
* @return returns false if getFilters() == null
*/
@Ensures({"result != (getFilters() == null)"})
public final boolean isFiltered() {
return getFilters() != null;
}
@Deprecated public boolean hasLog10PError() { return hasGQ(); }
@ -569,6 +578,16 @@ public abstract class Genotype implements Comparable<Genotype> {
return v == -1 ? "" : " " + name + " " + v;
}
/**
* Returns a display name for field name with String value v if this isn't null. Otherwise returns ""
* @param name of the field ("FT")
* @param v the value of the field, or null if missing
* @return a non-null string for display if the field is not missing
*/
protected final static String toStringIfExists(final String name, final String v) {
return v == null ? "" : " " + name + " " + v;
}
/**
* Returns a display name for field name with values vs if this isn't null. Otherwise returns ""
* @param name of the field ("AD")

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.*;
@ -63,6 +64,7 @@ public final class GenotypeBuilder {
private int[] AD = null;
private int[] PL = null;
private Map<String, Object> extendedAttributes = null;
private String filters = null;
private int initialAttributeMapSize = 5;
private boolean useFast = MAKE_FAST_BY_DEFAULT;
@ -147,6 +149,7 @@ public final class GenotypeBuilder {
DP(g.getDP());
AD(g.getAD());
PL(g.getPL());
filter(g.getFilters());
attributes(g.getExtendedAttributes());
return this;
}
@ -164,6 +167,7 @@ public final class GenotypeBuilder {
DP = -1;
AD = null;
PL = null;
filters = null;
extendedAttributes = null;
}
@ -180,21 +184,11 @@ public final class GenotypeBuilder {
public Genotype make() {
if ( useFast ) {
final Map<String, Object> ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes;
return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, ea);
return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea);
} else {
final Map<String, Object> attributes = new LinkedHashMap<String, Object>();
if ( extendedAttributes != null ) attributes.putAll(extendedAttributes);
final double log10PError = GQ == -1 ? SlowGenotype.NO_LOG10_PERROR : (GQ == 0 ? 0 : GQ / -10.0);
Set<String> filters = null;
if ( extendedAttributes != null && extendedAttributes.containsKey(VCFConstants.GENOTYPE_FILTER_KEY) )
{
final Object f = extendedAttributes.get(VCFConstants.GENOTYPE_FILTER_KEY);
if ( f != null )
filters = new LinkedHashSet<String>((List<String>)f);
attributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);
}
if ( DP != -1 ) attributes.put(VCFConstants.DEPTH_KEY, DP);
if ( AD != null ) attributes.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, AD);
final double[] log10likelihoods = PL != null ? GenotypeLikelihoods.fromPLs(PL).getAsVector() : null;
@ -383,9 +377,12 @@ public final class GenotypeBuilder {
*/
@Requires("filters != null")
public GenotypeBuilder filters(final List<String> filters) {
if ( ! filters.isEmpty() )
attribute(VCFConstants.GENOTYPE_FILTER_KEY, filters);
return this;
if ( filters.isEmpty() )
return filter(null);
else if ( filters.size() == 1 )
return filter(filters.get(0));
else
return filter(ParsingUtils.join(";", filters));
}
/**
@ -398,15 +395,24 @@ public final class GenotypeBuilder {
return filters(Arrays.asList(filters));
}
/**
* Most efficient version of setting filters -- just set the filters string to filters
*
* @param filter if filters == null or filters.equals("PASS") => genotype is PASS
* @return
*/
public GenotypeBuilder filter(final String filter) {
this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter;
return this;
}
/**
* This genotype is unfiltered
*
* @return
*/
public GenotypeBuilder unfiltered() {
if ( extendedAttributes != null )
extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);
return this;
return filter(null);
}
/**

View File

@ -42,14 +42,20 @@ public class SlowGenotype extends Genotype {
protected List<Allele> alleles = null;
protected boolean isPhased = false;
protected SlowGenotype(String sampleName, List<Allele> alleles, double log10PError, Set<String> filters, Map<String, Object> attributes, boolean isPhased, double[] log10Likelihoods) {
super(sampleName);
protected SlowGenotype(final String sampleName,
final List<Allele> alleles,
final double log10PError,
final String filters,
final Map<String, Object> attributes,
final boolean isPhased,
final double[] log10Likelihoods) {
super(sampleName, filters);
if ( alleles == null || alleles.isEmpty() )
this.alleles = Collections.emptyList();
else
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes);
commonInfo = new CommonInfo(sampleName, log10PError, Collections.<String>emptySet(), attributes);
if ( log10Likelihoods != null )
commonInfo.putAttribute(VCFConstants.GENOTYPE_PL_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods));
this.isPhased = isPhased;
@ -112,7 +118,6 @@ public class SlowGenotype extends Genotype {
// get routines to access context info fields
//
// ---------------------------------------------------------------------------------------------------------
@Override public List<String> getFilters() { return new ArrayList<String>(commonInfo.getFilters()); }
@Override public boolean hasLog10PError() { return commonInfo.hasLog10PError(); }
@Override public double getLog10PError() { return commonInfo.getLog10PError(); }

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
@ -49,17 +50,22 @@ public abstract class BCF2FieldWriter {
private final VCFHeader header;
private final BCF2FieldEncoder fieldEncoder;
@Requires({"header != null", "fieldEncoder != null"})
protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
this.header = header;
this.fieldEncoder = fieldEncoder;
}
@Ensures("result != null")
protected VCFHeader getHeader() { return header; }
@Ensures("result != null")
protected BCF2FieldEncoder getFieldEncoder() {
return fieldEncoder;
}
@Ensures("result != null")
protected String getField() { return getFieldEncoder().getField(); }
@Requires("vc != null")
public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
fieldEncoder.writeFieldKey(encoder);
}
@ -124,6 +130,9 @@ public abstract class BCF2FieldWriter {
}
@Override
@Requires({"encodingType != null",
"nValuesPerGenotype >= 0 || ! getFieldEncoder().hasConstantNumElements()"})
@Ensures("nValuesPerGenotype >= 0")
public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
// writes the key information
super.start(encoder, vc);
@ -141,15 +150,18 @@ public abstract class BCF2FieldWriter {
encoder.encodeType(nValuesPerGenotype, encodingType);
}
@Requires({"encodingType != null", "nValuesPerGenotype >= 0"})
public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
final Object fieldValue = g.getExtendedAttribute(getField(), null);
getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
}
@Ensures({"result >= 0"})
protected int numElements(final VariantContext vc, final Genotype g) {
return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField()));
}
@Ensures({"result >= 0"})
private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) {
int size = -1;
@ -227,6 +239,22 @@ public abstract class BCF2FieldWriter {
}
}
public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter {
public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
super(header, fieldEncoder);
}
public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
final String fieldValue = g.getFilters();
getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
}
@Override
protected int numElements(final VariantContext vc, final Genotype g) {
return getFieldEncoder().numElements(vc, g.getFilters());
}
}
public static class GTWriter extends GenotypesWriter {
final Map<Allele, Integer> alleleMapForTriPlus = new HashMap<Allele, Integer>(5);
Allele ref, alt1;

View File

@ -140,6 +140,8 @@ public class BCF2FieldWriterManager {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
return new BCF2FieldWriter.GTWriter(header, fieldEncoder);
} else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder);
} else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field));
} else if ( line.getType() == VCFHeaderLineType.Integer ) {

View File

@ -348,9 +348,8 @@ class VCFWriter extends IndexingVariantContextWriter {
missingSampleError(vc, mHeader);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String field : genotypeFormatKeys ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
@ -363,54 +362,53 @@ class VCFWriter extends IndexingVariantContextWriter {
}
continue;
}
String outputValue;
final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);
if ( accessor != null ) {
final int[] intValues = accessor.getValues(g);
if ( intValues == null )
outputValue = VCFConstants.MISSING_VALUE_v4;
else if ( intValues.length == 1 ) // fast path
outputValue = Integer.toString(intValues[0]);
else {
StringBuilder sb = new StringBuilder();
sb.append(intValues[0]);
for ( int i = 1; i < intValues.length; i++) {
sb.append(",");
sb.append(intValues[i]);
}
outputValue = sb.toString();
}
} else {
Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;
// some exceptions
String outputValue;
if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) {
val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4;
}
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
if ( metaData != null ) {
int numInFormatField = metaData.getCount(vc);
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4;
} else {
final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);
if ( accessor != null ) {
final int[] intValues = accessor.getValues(g);
if ( intValues == null )
outputValue = VCFConstants.MISSING_VALUE_v4;
else if ( intValues.length == 1 ) // fast path
outputValue = Integer.toString(intValues[0]);
else {
StringBuilder sb = new StringBuilder();
sb.append(intValues[0]);
for ( int i = 1; i < intValues.length; i++) {
sb.append(",");
sb.append(intValues[i]);
}
outputValue = sb.toString();
}
val = sb.toString();
} else {
Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
if ( metaData != null ) {
int numInFormatField = metaData.getCount(vc);
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
}
val = sb.toString();
}
}
// assume that if key is absent, then the given string encoding suffices
outputValue = formatVCFField(val);
}
}
// assume that if key is absent, then the given string encoding suffices
outputValue = formatVCFField(val);
if ( outputValue != null )
attrs.add(outputValue);
}
if ( outputValue != null )
attrs.add(outputValue);
}
// strip off trailing missing values

View File

@ -48,6 +48,26 @@ public class GenotypeUnitTest extends BaseTest {
T = Allele.create("T");
}
private static final GenotypeBuilder makeGB() {
return new GenotypeBuilder("misc");
}
@Test
public void testFilters() {
Assert.assertFalse(makeGB().make().isFiltered(), "by default Genotypes must be PASS");
Assert.assertNull(makeGB().make().getFilters(), "by default Genotypes must be PASS => getFilters() == null");
Assert.assertFalse(makeGB().filter(null).make().isFiltered(), "setting filter == null => Genotypes must be PASS");
Assert.assertNull(makeGB().filter(null).make().getFilters(), "Genotypes PASS => getFilters == null");
Assert.assertFalse(makeGB().filter("PASS").make().isFiltered(), "setting filter == PASS => Genotypes must be PASS");
Assert.assertNull(makeGB().filter("PASS").make().getFilters(), "Genotypes PASS => getFilters == null");
Assert.assertTrue(makeGB().filter("x").make().isFiltered(), "setting filter != null => Genotypes must be PASS");
Assert.assertEquals(makeGB().filter("x").make().getFilters(), "x", "Should get back the expected filter string");
Assert.assertEquals(makeGB().filters("x", "y").make().getFilters(), "x;y", "Multiple filter field values should be joined with ;");
Assert.assertEquals(makeGB().filters("x", "y", "z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;");
Assert.assertTrue(makeGB().filters("x", "y", "z").make().isFiltered(), "Multiple filter values should be filtered");
Assert.assertEquals(makeGB().filter("x;y;z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;");
}
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased, double[] log10Likelihoods) {
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, double[] log10Likelihoods)

View File

@ -191,7 +191,7 @@ public class VariantContextTestProvider {
addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String);
addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
addHeaderLine(metaData, "FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));