VariantFiltration can now filter specific samples.

This is *NOT* an ideal implementation.  One day when we have lots of free time (or a greater desire), we will implement this correctly and sophisticatedly using all the power of JEXL.  For now, though, this will have to do.
Docs coming tonight.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3060 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-22 20:45:11 +00:00
parent 543aefc3d7
commit 0097106938
7 changed files with 147 additions and 80 deletions

View File

@ -496,7 +496,7 @@ public class VariantContext {
return ref;
}
/** Private helper routine that grabs the reference allele but doesn't through an error if there's no such allele */
/** Private helper routine that grabs the reference allele but doesn't throw an error if there's no such allele */
private Allele getReferenceWithoutError() {
for ( Allele allele : getAlleles() )
if ( allele.isReference() )

View File

@ -123,6 +123,31 @@ public class VariantContextUtils {
}
/**
* Returns true if exp match VC. See collection<> version for full docs.
* @param g genotype
* @param exp expression
* @return true if there is a match
*/
public static boolean match(Genotype g, JexlVCMatchExp exp) {
return match(g,Arrays.asList(exp)).get(exp);
}
/**
* Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these
* expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL
* expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp
* expressions.
*
* @param g genotype
* @param exps expressions
* @return true if there is a match
*/
public static Map<JexlVCMatchExp, Boolean> match(Genotype g, Collection<JexlVCMatchExp> exps) {
return new VariantJEXLContext(exps,g).getVars();
}
private static void addAttributesToMap(Map<String, String> infoMap, Map<String, ?> attributes, String prefix ) {
for (Map.Entry<String, ?> e : attributes.entrySet()) {
infoMap.put(prefix + String.valueOf(e.getKey()), String.valueOf(e.getValue()));

View File

@ -50,12 +50,14 @@ class VariantJEXLContext implements JexlContext {
map = new JEXLMap(jexl, vc);
}
@Override
public VariantJEXLContext(Collection<VariantContextUtils.JexlVCMatchExp> jexl, Genotype g) {
map = new JEXLMap(jexl, g);
}
public void setVars(Map map) {
throw new UnsupportedOperationException("this operation is unsupported");
}
@Override
public Map getVars() {
return map;
}
@ -72,18 +74,32 @@ class VariantJEXLContext implements JexlContext {
*/
class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
// our variant context
// our variant context and/or Genotype
private final VariantContext vc;
private final Genotype g;
// our context
private JexlContext jContext = null;
// our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp
private final Map<VariantContextUtils.JexlVCMatchExp,Boolean> jexl;
private Map<VariantContextUtils.JexlVCMatchExp,Boolean> jexl;
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc, Genotype g) {
this.vc = vc;
this.g = g;
initialize(jexlCollection);
}
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc) {
this.vc = vc;
this(jexlCollection, vc, null);
}
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, Genotype g) {
this(jexlCollection, null, g);
}
private void initialize(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection) {
jexl = new HashMap<VariantContextUtils.JexlVCMatchExp,Boolean>();
for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) {
jexl.put(exp, null);
@ -94,38 +110,49 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
* create the internal JexlContext, only when required. This code is where new JEXL context variables
* should get added.
*
* @param vc the VariantContext
*
*/
private void createContext(VariantContext vc) {
// create a mapping of what we know about the variant context, its Chromosome, positions, etc.
private void createContext() {
Map<String, String> infoMap = new HashMap<String, String>();
infoMap.put("CHROM", vc.getLocation().getContig());
infoMap.put("POS", String.valueOf(vc.getLocation().getStart()));
infoMap.put("TYPE", vc.getType().toString());
infoMap.put("QUAL", String.valueOf(10 * vc.getNegLog10PError()));
// add alleles
infoMap.put("ALLELES", Utils.join(";", vc.getAlleles()));
infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles()));
if ( vc != null ) {
// create a mapping of what we know about the variant context, its Chromosome, positions, etc.
infoMap.put("CHROM", vc.getLocation().getContig());
infoMap.put("POS", String.valueOf(vc.getLocation().getStart()));
infoMap.put("TYPE", vc.getType().toString());
infoMap.put("QUAL", String.valueOf(10 * vc.getNegLog10PError()));
// add attributes
addAttributesToMap(infoMap, vc.getAttributes());
// add alleles
infoMap.put("ALLELES", Utils.join(";", vc.getAlleles()));
infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles()));
// add filter fields
infoMap.put("FILTER", String.valueOf(vc.isFiltered() ? "1" : "0"));
for ( Object filterCode : vc.getFilters() ) {
infoMap.put(String.valueOf(filterCode), "1");
// add attributes
addAttributesToMap(infoMap, vc.getAttributes());
// add filter fields
infoMap.put("FILTER", vc.isFiltered() ? "1" : "0");
for ( Object filterCode : vc.getFilters() ) {
infoMap.put(String.valueOf(filterCode), "1");
}
// add genotype-specific fields
// TODO -- implement me when we figure out a good way to represent this
// for ( Genotype g : vc.getGenotypes().values() ) {
// String prefix = g.getSampleName() + ".";
// addAttributesToMap(infoMap, g.getAttributes(), prefix);
// infoMap.put(prefix + "GT", g.getGenotypeString());
// }
}
// add specific genotype if one is provided
if ( g != null ) {
infoMap.put("GT", g.getGenotypeString());
infoMap.put("isHomRef", g.isHomRef() ? "1" : "0");
infoMap.put("isHet", g.isHet() ? "1" : "0");
infoMap.put("isHomVar", g.isHomVar() ? "1" : "0");
for ( Map.Entry<String, Object> e : g.getAttributes().entrySet() )
infoMap.put(e.getKey(), String.valueOf(e.getValue()));
}
// add genotypes
// todo -- comment this back in when we figure out how to represent it nicely
// for ( Genotype g : vc.getGenotypes().values() ) {
// String prefix = g.getSampleName() + ".";
// addAttributesToMap(infoMap, g.getAttributes(), prefix);
// infoMap.put(prefix + "GT", g.getGenotypeString());
// }
// create the internal context that we can evaluate expressions against
jContext = JexlHelper.createContext();
@ -135,7 +162,6 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
/**
* @return the size of the internal data structure
*/
@Override
public int size() {
return jexl.size();
}
@ -143,7 +169,6 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
/**
* @return true if we're empty
*/
@Override
public boolean isEmpty() { return this.jexl.isEmpty(); }
/**
@ -151,10 +176,8 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
* @param o the key
* @return true if we have a value for that key
*/
@Override
public boolean containsKey(Object o) { return jexl.containsKey(o); }
@Override
public Boolean get(Object o) {
// if we've already determined the value, return it
if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o);
@ -169,7 +192,6 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
* get the keyset of map
* @return a set of keys of type JexlVCMatchExp
*/
@Override
public Set<VariantContextUtils.JexlVCMatchExp> keySet() {
return jexl.keySet();
}
@ -180,7 +202,6 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
* the keys you want, you would be better off using get() to get them by name.
* @return a collection of boolean values, representing the results of all the variants evaluated
*/
@Override
public Collection<Boolean> values() {
// this is an expensive call
for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet())
@ -195,7 +216,7 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
*/
private void evaulateExpression(VariantContextUtils.JexlVCMatchExp exp) {
// if the context is null, we need to create it to evaluate the JEXL expression
if (this.jContext == null) createContext(vc);
if (this.jContext == null) createContext();
try {
jexl.put (exp, (Boolean) exp.exp.evaluate(jContext));
} catch (Exception e) {
@ -210,16 +231,14 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
*/
private static void addAttributesToMap(Map<String, String> infoMap, Map<String, ?> attributes ) {
for (Map.Entry<String, ?> e : attributes.entrySet()) {
infoMap.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
infoMap.put(e.getKey(), String.valueOf(e.getValue()));
}
}
@Override
public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) {
return jexl.put(jexlVCMatchExp,aBoolean);
}
@Override
public void putAll(Map<? extends VariantContextUtils.JexlVCMatchExp, ? extends Boolean> map) {
jexl.putAll(map);
}
@ -230,25 +249,21 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
// this doesn't make much sense to implement, boolean doesn't offer too much variety to deal
// with evaluating every key in the internal map.
@Override
public boolean containsValue(Object o) {
throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap");
}
// this doesn't make much sense
@Override
public Boolean remove(Object o) {
throw new UnsupportedOperationException("remove() not supported on a JEXLMap");
}
@Override
public Set<Entry<VariantContextUtils.JexlVCMatchExp, Boolean>> entrySet() {
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
}
// nope
@Override
public void clear() {
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
}

View File

@ -217,10 +217,10 @@ public class VariantContextAdaptors {
}
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase) {
return toVCF(vc, vcfRefBase, null, true);
return toVCF(vc, vcfRefBase, null, true, false);
}
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> allowedGenotypeAttributeKeys, boolean filtersWereAppliedToContext) {
public static VCFRecord toVCF(VariantContext vc, char vcfRefBase, List<String> allowedGenotypeAttributeKeys, boolean filtersWereAppliedToContext, boolean filtersWereAppliedToGenotypes) {
// deal with the reference
String referenceBases = new String(vc.getReference().getBases());
@ -284,6 +284,8 @@ public class VariantContextAdaptors {
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
vcfGenotypeAttributeKeys.add(key);
}
if ( filtersWereAppliedToGenotypes )
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
}
String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys);
@ -313,6 +315,8 @@ public class VariantContextAdaptors {
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
if ( pileup != null )
val = pileup.size();
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : VCFRecord.PASSES_FILTERS;
}
String outputValue = formatVCFField(key, val);

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.*;
@ -25,16 +26,22 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected String[] FILTER_NAMES = new String[]{};
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize)", required=false)
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
protected String[] GENOTYPE_FILTER_EXPS = new String[]{};
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected String[] GENOTYPE_FILTER_NAMES = new String[]{};
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
protected Integer clusterSize = 3;
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1)", required=false)
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
protected Integer clusterWindow = 0;
@Argument(fullName="maskName", shortName="mask", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
@Argument(fullName="maskName", shortName="mask", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
protected String MASK_NAME = "Mask";
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster";
@ -70,9 +77,13 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
if ( clusterWindow > 0 )
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
}
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
if ( genotypeFilterExps.size() > 0 )
hInfo.add(new VCFFormatHeaderLine(VCFGenotypeRecord.GENOTYPE_FILTER_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype-level filter"));
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
@ -91,6 +102,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
clusteredSNPs = new ClusteredSnps(clusterSize, clusterWindow);
filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS);
}
public Integer reduceInit() { return 0; }
@ -136,45 +148,53 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
if ( context == null )
return;
StringBuilder filterString = new StringBuilder();
VariantContext vc = context.getVariantContext();
// make new Genotypes based on filters
Map<String, Genotype> genotypes;
if ( genotypeFilterExps.size() == 0 ) {
genotypes = vc.getGenotypes();
} else {
genotypes = new HashMap<String, Genotype>(vc.getGenotypes().size());
// for each genotype, check filters then create a new object
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
Genotype g = genotype.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(g, exp) )
filters.add(exp.name);
}
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.genotypesArePhased()));
}
}
// make a new variant context based on filters
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
// test for SNP mask, if present
RODRecordList mask = context.getTracker().getTrackData("mask", null);
if ( mask != null && mask.size() > 0 )
addFilter(filterString, MASK_NAME);
filters.add(MASK_NAME);
// test for clustered SNPs if requested
if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) )
addFilter(filterString, CLUSTERED_SNP_FILTER_NAME);
filters.add(CLUSTERED_SNP_FILTER_NAME);
VariantContext vc = context.getVariantContext();
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
if ( VariantContextUtils.match(vc, exp) )
addFilter(filterString, exp.name);
filters.add(exp.name);
}
writeVCF(VariantContextAdaptors.toVCF(vc, context.getReferenceContext().getBase(), null, true), filterString);
VariantContext filteredVC = new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
writeVCF(VariantContextAdaptors.toVCF(filteredVC, context.getReferenceContext().getBase(), null, true, genotypeFilterExps.size() > 0));
}
private static void addFilter(StringBuilder sb, String filter) {
if ( sb.length() > 0 )
sb.append(";");
sb.append(filter);
}
private void writeVCF(VCFRecord rec, StringBuilder filterString) {
if ( filterString.length() != 0 ) {
// if the record is already filtered, don't destroy those filters
if ( rec.isFiltered() )
filterString.append(";" + rec.getFilterString());
rec.setFilterString(filterString.toString());
}
// otherwise, if it's not already filtered, set it to "passing filters"
else if ( !rec.isFiltered() ) {
rec.setFilterString(VCFRecord.PASSES_FILTERS);
}
private void writeVCF(VCFRecord rec) {
if ( writer == null )
initializeVcfWriter(rec);
writer.addRecord(rec);

View File

@ -28,6 +28,7 @@ public class VCFGenotypeRecord {
public static final int MISSING_GENOTYPE_QUALITY = -1;
public static final int MISSING_DEPTH = -1;
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
public static final String PASSES_FILTERS = "0";
public static final String UNFILTERED = ".";
public static final double MAX_QUAL_VALUE = 99.0;
@ -199,7 +200,9 @@ public class VCFGenotypeRecord {
}
public boolean isFiltered() {
return ( mFields.get(GENOTYPE_FILTER_KEY) != null && ! mFields.get(GENOTYPE_FILTER_KEY).equals("0"));
return ( mFields.get(GENOTYPE_FILTER_KEY) != null &&
!mFields.get(GENOTYPE_FILTER_KEY).equals(UNFILTERED) &&
!mFields.get(GENOTYPE_FILTER_KEY).equals(PASSES_FILTERS));
}
public int getPloidy() {

View File

@ -83,7 +83,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added");
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), allowedGenotypeFormatStrings, false);
VCFRecord call = VariantContextAdaptors.toVCF(vc, vc.getReference().toString().charAt(0), allowedGenotypeFormatStrings, false, false);
Set<Allele> altAlleles = vc.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();