fix 1430 for genotype filters; refactored filter() method; added unit and integration test; more comprehensive fix must be done first on htsjdk side in JEXLMap (#1456)

This commit is contained in:
Steve Huang 2016-09-06 17:30:18 -04:00 committed by GitHub
parent d582ee6d3c
commit 3c88e6859f
3 changed files with 461 additions and 240 deletions

View File

@ -211,6 +211,24 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
executeTest("testFilteringDPfromFORMAT", spec);
}
@Test
public void testFilteringDPfromFORMATWithMissing() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
Arrays.asList("4bf46103a71bac92a11eae04b97f9877"));
executeTest("testFilteringDPfromFORMATWithMissing", spec);
}
@Test
public void testFilteringDPfromFORMATAndFailMissing() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
Arrays.asList("baeda696c92adc8745ac4ebbdead6c91"));
executeTest("testFilteringDPfromFORMATAndFailMissing", spec);
}
@Test
public void testInvertGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.tools.walkers.filters;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.commandline.*;
@ -55,7 +56,7 @@ import java.util.*;
*
* <p>
* This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered
* by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
* by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
* in the output unless their removal is requested in the command line. </p>
*
* <p>The most common way of specifying filtering criteria is by using JEXL queries. See the
@ -84,7 +85,7 @@ import java.util.*;
* </pre>
*
* <h3>Caveat</h3>
* <p>when you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied
* <p>when you run {@link VariantFiltration} with a command that includes multiple logical parts, each part of the command is applied
* individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies
* some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a
* genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might
@ -98,14 +99,17 @@ import java.util.*;
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltration extends RodWalker<Integer, Integer> {
// -----------------------------------------------------------------------------------------------
// Arguments
// -----------------------------------------------------------------------------------------------
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed,
* i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used.
* i.e. filter variants that do not overlap with provided mask, then argument {@code -filterNotInMask} can be used.
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false)
public RodBinding<Feature> mask;
@ -115,41 +119,41 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
/**
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
* {@code --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"}).
*/
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList<String> filterExpressions = new ArrayList<String>();
protected ArrayList<String> filterExpressions = new ArrayList<>();
/**
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
* This name is put in the <pre>FILTER</pre> field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
*/
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList<String> filterNames = new ArrayList<String>();
protected ArrayList<String> filterNames = new ArrayList<>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
* VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
* One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
* methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). Also available are
* expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object.
* Similar to the <pre>INFO</pre> field based expressions, but used on the <pre>FORMAT</pre> (genotype) fields instead.
* {@link VariantFiltration} will add the sample-level <pre>FT</pre> tag to the <pre>FORMAT</pre> field of filtered samples (this does not affect the record's <pre>FILTER</pre> tag).
* One can filter normally based on most fields (e.g. {@code "GQ < 5.0"}), but the <pre>GT</pre> (genotype) field is an exception.
* We have put in convenience methods so that one can now filter out hets ({@code "isHet == 1"}), refs ({@code "isHomRef == 1"}), or homs ({@code "isHomVar == 1"}).
* Also available are expressions {@code isCalled}, {@code isNoCall}, {@code isMixed}, and {@code isAvailable}, in accordance with the methods of the {@link Genotype} object.
*/
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<String>();
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
* Similar to the <pre>INFO</pre> field based expressions, but used on the <pre>FORMAT</pre> (genotype) fields instead.
*/
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList<String> genotypeFilterNames = new ArrayList<String>();
protected ArrayList<String> genotypeFilterNames = new ArrayList<>();
/**
* Works together with the --clusterWindowSize argument.
* Works together with the {@code --clusterWindowSize} argument.
*/
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
/**
* Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
* Works together with the {@code --clusterWindowSize} argument. To disable the clustered SNP filter, set this value to less than 1.
*/
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
@ -158,45 +162,45 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
protected Integer maskExtension = 0;
/**
* When using the -mask argument, the maskName will be annotated in the variant record.
* Note that when using the -filterNotInMask argument to reverse the masking logic,
* When using the {@code -mask} argument, the {@code maskName} will be annotated in the variant record.
* Note that when using the {@code -filterNotInMask} argument to reverse the masking logic,
* it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String maskName = "Mask";
/**
* By default, if the -mask argument is used, any variant falling in a mask will be filtered.
* By default, if the {@code -mask} argument is used, any variant falling in a mask will be filtered.
* If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered.
* Use case is, for example, if we have an interval list or BED file with "good" sites.
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false)
protected boolean filterRecordsNotInMask = false;
/**
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as <pre>PASS</pre>ing.
* Use this argument to have it evaluate as failing filters instead for these cases.
*/
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean failMissingValues = false;
/**
* Invalidate previous filters applied to the VariantContext, applying only the filters here
* Invalidate previous filters applied to the {@link VariantContext}, applying only the filters here.
*/
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
boolean invalidatePrevious = false;
/**
* Invert the selection criteria for --filterExpression
* Invert the selection criteria for {@code --filterExpression}.
*/
@Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
protected boolean invertFilterExpression = false;
/**
* Invert the selection criteria for --genotypeFilterExpression
* Invert the selection criteria for {@code --genotypeFilterExpression}.
*/
@Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
protected boolean invertGenotypeFilterExpression = false;
@ -207,81 +211,42 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
private boolean setFilteredGenotypesToNocall = false;
// -----------------------------------------------------------------------------------------------
// Fields
// -----------------------------------------------------------------------------------------------
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster";
private ClusteredSnps clusteredSNPs = null;
private GenomeLoc previousMaskPosition = null;
// the structures necessary to initialize and maintain a windowed context
private FiltrationContextWindow variantContextWindow;
private FiltrationContextWindow variantContextWindow; // the structures necessary to initialize and maintain a windowed context
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<>();
private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private static final List<Allele> DIPLOID_NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Prepend inverse phrase to description if --invertFilterExpression
*
* @param description the description
* @return the description with inverse prepended if --invert_filter_expression
*/
private String possiblyInvertFilterExpression( String description ){
if ( invertFilterExpression )
description = "Inverse of: " + description;
return description;
}
private void initializeVcfWriter() {
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
// need AC, AN and AF since output if set filtered genotypes to no-call
if ( setFilteredGenotypesToNocall ) {
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
}
if ( clusterWindow > 0 )
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
if ( !genotypeFilterExps.isEmpty() )
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
try {
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
if ( mask.isBound() ) {
if (filterRecordsNotInMask)
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
}
} catch (IllegalArgumentException e) {
throw new UserException.BadInput(e.getMessage());
}
writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
}
// -----------------------------------------------------------------------------------------------
// public methods from base classes
// -----------------------------------------------------------------------------------------------
public void initialize() {
if ( clusterWindow > 0 )
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow);
if ( maskExtension < 0 )
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
if ( maskExtension < 0 ) {
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
}
if (filterRecordsNotInMask && !mask.isBound()) {
throw new UserException.BadArgumentValue("filterNotInMask", "argument not allowed if mask argument is not provided");
}
if ( clusterWindow > 0 ) {
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(), clusterSize, clusterWindow);
}
if (filterRecordsNotInMask && !mask.isBound())
throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
@ -290,44 +255,45 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
initializeVcfWriter();
}
public Integer reduceInit() { return 0; }
/**
*
* @param tracker the meta-data tracker
* @param ref the reference base
* @param context the context for the given locus
* @return 1 if the locus was successfully processed, 0 if otherwise
* @return 1 if the locus was successfully processed, 0 otherwise
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
if ( tracker == null ) {
return 0;
}
Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
// is there a SNP mask present?
boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
if ( hasMask )
final boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
if ( hasMask ) {
previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask
}
for ( VariantContext vc : VCs ) {
if ( invalidatePrevious ) {
vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
vc = (new VariantContextBuilder(vc)).filters(new HashSet<>()).make();
}
// filter based on previous mask position
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
FiltrationContext varContext = new FiltrationContext(ref, vc);
final FiltrationContext varContext = new FiltrationContext(ref, vc);
// if we're still initializing the context, do so
if ( windowInitializer != null ) {
// if this is a mask position, filter previous records
if ( hasMask ) {
for ( FiltrationContext prevVC : windowInitializer )
for ( final FiltrationContext prevVC : windowInitializer ) {
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
}
}
windowInitializer.add(varContext);
@ -339,9 +305,10 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
// if this is a mask position, filter previous records
if ( hasMask ) {
for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
if ( prevVC != null )
for ( final FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
if ( prevVC != null ) {
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
}
}
}
@ -353,6 +320,244 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
return 1;
}
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
public Integer reduceInit() { return 0; }
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
// move the window over so that we can filter the last few variants
if ( windowInitializer != null ) {
while ( windowInitializer.size() < WINDOW_SIZE ) {
windowInitializer.add(null);
}
variantContextWindow = new FiltrationContextWindow(windowInitializer);
}
for (int i=0; i < WINDOW_SIZE; i++) {
variantContextWindow.moveWindow(null);
filter();
}
}
// -----------------------------------------------------------------------------------------------
// main filtering steps
// -----------------------------------------------------------------------------------------------
/**
* Organizing filters: genotype filters and normal filters.
*/
private void filter() {
// get the current context
final FiltrationContext context = variantContextWindow.getContext();
if ( context == null ) {
return;
}
final VariantContext vc = context.getVariantContext();
// make new Genotypes based on genotype filters
final VariantContextBuilder builder = ( genotypeFilterExps.isEmpty() && !setFilteredGenotypesToNocall ) ? new VariantContextBuilder(vc)
: applyGenotypeFilters(vc, genotypeFilterExps, invertGenotypeFilterExpression, failMissingValues, setFilteredGenotypesToNocall);
// extract filters already in VC and append new filters
final Set<String> filters = buildVCfilters(vc, filterExps, invertFilterExpression, failMissingValues);
// test for clustered SNPs if requested
if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) {
filters.add(CLUSTERED_SNP_FILTER_NAME);
}
// make a new variant context based on all filters, and write
writer.add(filters.isEmpty() ? builder.passFilters().make() : builder.filters(filters).make());
}
/**
* Given a VC builder and a vc (which was used to construct the builder), update the properties that the builder
* will use to construct a new VC, based on some of the attributes/annotations of the old VC.
* @param vc variant context holding genotypes to be filtered
* @param genotypeFilterExpressions genotype filter expressions
* @param invertGenotypeFilterExpression should invert the genotype filter expression or not
* @param failIfMissingValues if sample misses the corresponding annotation(s) the filter(s) work by, should we fail them or not
* @param setFilteredGenotypesToNocall if sample is filtered should we set genotype to non-call or not
*/
@VisibleForTesting
static VariantContextBuilder applyGenotypeFilters(final VariantContext vc,
final List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExpressions,
final boolean invertGenotypeFilterExpression,
final boolean failIfMissingValues,
final boolean setFilteredGenotypesToNocall) {
final VariantContextBuilder builder = new VariantContextBuilder(vc);
final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
// recompute AC, AN and AF if filtered genotypes are set to no-call
// occurrences of alternate alleles over all genotypes
final Map<Allele, Integer> calledAltAlleles = new LinkedHashMap<>(vc.getNAlleles()-1);
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
calledAltAlleles.put(altAllele, 0);
}
int calledAlleles = 0;
boolean haveFilteredNoCallAlleles = false;
// for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) {
final List<String> filters = new ArrayList<>();
if ( g.isFiltered() ) filters.add(g.getFilters());
// Add if expression filters the variant context
for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExpressions ) {
try {
if (Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression)) {
filters.add(exp.name);
}
} catch (final IllegalArgumentException e) {
// logic: right now (2016/08/18) if a filter is applied based on specific annotation and some sample contains missing value for such annotation,
// lower level code will throw IllegalArgumentException, therefore we specifically catch this type of exception
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
if ( failIfMissingValues ) {
filters.add(exp.name);
}
}
}
// if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
haveFilteredNoCallAlleles = true;
genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(DIPLOID_NO_CALL_ALLELES).make());
}
else {
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
}
} else {
genotypes.add(g);
}
}
builder.genotypes(genotypes);
// if filtered genotypes are set to no-call, output recomputed AC, AN, AF
if ( haveFilteredNoCallAlleles ) {
GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
}
return builder;
}
/**
* Extract filters already present in the {@code vc}, and append user provided expressions.
* For user provided genotype filter expressions, see {@link #applyGenotypeFilters(VariantContext, List, boolean, boolean, boolean)}
* @param vc VC from which filters to be extracted
* @param vcFilterExpressions more filter expressions provided by user
* @param invertVCfilterExpression should we invert the logic in expressions provided in {@code vcFilterExpressions}
* @param failIfMissingValues should we mark the VC as failing if it misses the value the filters work on
*
* @return filters already in the provided vc and user-provided filters
*/
@VisibleForTesting
static Set<String> buildVCfilters(final VariantContext vc,
final List<VariantContextUtils.JexlVCMatchExp> vcFilterExpressions,
final boolean invertVCfilterExpression,
final boolean failIfMissingValues) {
final Set<String> filters = new LinkedHashSet<>(vc.getFilters());
for ( final VariantContextUtils.JexlVCMatchExp exp : vcFilterExpressions ) {
try {
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertVCfilterExpression) ) {
filters.add(exp.name);
}
} catch (final Exception e) {
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
if ( failIfMissingValues ) {
filters.add(exp.name);
}
}
}
return filters;
}
// -----------------------------------------------------------------------------------------------
// some other complications besides main stuff
// -----------------------------------------------------------------------------------------------
private void initializeVcfWriter() {
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
// need AC, AN and AF since output if set filtered genotypes to no-call
if ( setFilteredGenotypesToNocall ) {
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
}
if ( clusterWindow > 0 ) {
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
}
if ( !genotypeFilterExps.isEmpty() ) {
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
}
try {
for ( final VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
}
for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
}
if ( mask.isBound() ) {
hInfo.add(new VCFFilterHeaderLine(maskName, filterRecordsNotInMask ? "Doesn't overlap a user-input mask" : "Overlaps a user-input mask"));
}
} catch (final IllegalArgumentException e) {
throw new UserException.BadInput(e.getMessage());
}
writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
}
/**
* Prepend inverse phrase to description if {@code --invertFilterExpression}.
*
* @param description the description
* @return the description with inverse prepended if {@code --invert_filter_expression}.
*/
private String possiblyInvertFilterExpression( final String description ){
return invertFilterExpression ? "Inverse of: " + description : description;
}
/**
* Add mask to variant context filters if it covers its location.
* @param vc VariantContext
* @param genomeLoc genome location
* @param maskName name of the mask
* @param maskExtension bases beyond the mask
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
*/
private VariantContext addMaskIfCoversVariant(VariantContext vc, final GenomeLoc genomeLoc, final String maskName, final int maskExtension, final boolean locStart) {
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
final Set<String> filters = new LinkedHashSet<>(vc.getFilters());
filters.add(maskName);
vc = new VariantContextBuilder(vc).filters(filters).make();
}
return vc;
}
/**
* Helper function to check if a mask covers the variant location.
*
@ -364,141 +569,14 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
* @return true if the genome location is within the extended mask area, false otherwise
*/
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
boolean logic = genomeLoc != null && // have a location
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
final boolean needToCheckOveralpping = genomeLoc != null && // have a location
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
if ( logic ) {
if (vcBeforeLoc)
return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
else
return vc.getStart() - genomeLoc.getStop() <= maskExtension;
if ( needToCheckOveralpping ) {
return vcBeforeLoc ? (genomeLoc.getStart() - vc.getEnd() <= maskExtension) // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
: (vc.getStart() - genomeLoc.getStop() <= maskExtension);
} else {
return false;
}
}
/**
* Add mask to variant context filters if it covers the it's location
*
* @param vc VariantContext
* @param genomeLoc genome location
* @param maskName name of the mask
* @param maskExtension bases beyond the mask
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
*/
private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) {
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
filters.add(maskName);
vc = new VariantContextBuilder(vc).filters(filters).make();
}
return vc;
}
private void filter() {
// get the current context
FiltrationContext context = variantContextWindow.getContext();
if ( context == null )
return;
final VariantContext vc = context.getVariantContext();
final VariantContextBuilder builder = new VariantContextBuilder(vc);
// make new Genotypes based on filters
if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) {
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
//
// recompute AC, AN and AF if filtered genotypes are set to no-call
//
// occurrences of alternate alleles over all genotypes
final Map<Allele, Integer> calledAltAlleles = new LinkedHashMap<Allele, Integer>(vc.getNAlleles()-1);
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
calledAltAlleles.put(altAllele, 0);
}
int calledAlleles = 0;
boolean haveFilteredNoCallAlleles = false;
// for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) {
final List<String> filters = new ArrayList<String>();
if ( g.isFiltered() ) filters.add(g.getFilters());
// Add if expression filters the variant context
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
filters.add(exp.name);
}
// if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
haveFilteredNoCallAlleles = true;
genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make());
}
else {
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
}
} else {
genotypes.add(g);
}
}
builder.genotypes(genotypes);
// if filtered genotypes are set to no-call, output recomputed AC, AN, AF
if ( haveFilteredNoCallAlleles )
GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
}
// make a new variant context based on filters
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
// test for clustered SNPs if requested
if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) )
filters.add(CLUSTERED_SNP_FILTER_NAME);
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
try {
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) )
filters.add(exp.name);
} catch (Exception e) {
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
if ( failMissingValues )
filters.add(exp.name);
}
}
if ( filters.isEmpty() )
builder.passFilters();
else
builder.filters(filters);
writer.add(builder.make());
}
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
// move the window over so that we can filter the last few variants
if ( windowInitializer != null ) {
while ( windowInitializer.size() < WINDOW_SIZE )
windowInitializer.add(null);
variantContextWindow = new FiltrationContextWindow(windowInitializer);
}
for (int i=0; i < WINDOW_SIZE; i++) {
variantContextWindow.moveWindow(null);
filter();
}
}
}

View File

@ -26,21 +26,24 @@
package org.broadinstitute.gatk.tools.walkers.filters;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.testng.Assert;
import org.testng.annotations.*;
import java.util.Set;
import java.util.stream.Collectors;
public class VariantFiltrationUnitTest extends BaseTest {
@ -58,7 +61,7 @@ public class VariantFiltrationUnitTest extends BaseTest {
}
@DataProvider(name = "VariantMaskData")
public Object[][] DoesMaskCoverVariantTestData() {
public Object[][] doesMaskCoverVariantTestData() {
final String maskName = "testMask";
@ -85,8 +88,8 @@ public class VariantFiltrationUnitTest extends BaseTest {
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
* @param expectedValue return the expected return value from doesMaskCoverVariant()
*/
@Test(dataProvider = "VariantMaskData")
public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
@Test(dataProvider = "doesMaskCoverVariantTestData")
public void testDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
final boolean vcBeforeLoc, final boolean expectedValue) {
// Build VariantContext
@ -104,4 +107,126 @@ public class VariantFiltrationUnitTest extends BaseTest {
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
Assert.assertEquals(coversVariant, expectedValue);
}
@Test
public void testApplyGenotypeFilters(){
final VariantContext vc = buildDataForFilters().make();
final String filterName = "LowDP";
final String filterExpr = "DP < 10";
final List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
final VariantContextBuilder anotherVCBuilder = VariantFiltration.applyGenotypeFilters(vc, genotypeFilterExps, false, false, false);
final VariantContext anotherVC = anotherVCBuilder.filters().make();
Assert.assertEquals(anotherVC.getGenotype("one").isFiltered(), true);
Assert.assertTrue(anotherVC.getGenotype("one").getFilters().equals(filterName));
Assert.assertEquals(anotherVC.getGenotype("two").isFiltered(), false);
Assert.assertEquals(anotherVC.getGenotype("three").isFiltered(), false);
Assert.assertEquals(anotherVC.getGenotype("four").isFiltered(), false);
Assert.assertEquals(anotherVC.getGenotype("five").isFiltered(), false);
Assert.assertEquals(anotherVC.getGenotype("six").isFiltered(), false);
final VariantContextBuilder yetAnotherVCBuilder = VariantFiltration.applyGenotypeFilters(anotherVC, genotypeFilterExps, false, true, false);
final VariantContext yetAnotherVC = yetAnotherVCBuilder.filters().make();
Assert.assertEquals(yetAnotherVC.getGenotype("six").isFiltered(), true);
Assert.assertTrue(yetAnotherVC.getGenotype("six").getFilters().equals(filterName));
}
@Test
public void testApplyVCFilters(){
final VariantContext vcNoFilters = buildDataForFilters().make(); // assumes this vc doesn't hold any filters yet
String filterName = "LowDP";
String filterExpr = "DP < 23";
List<VariantContextUtils.JexlVCMatchExp> vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
final Set<String> filters = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
Assert.assertFalse(vcNoFilters.isFiltered());
Assert.assertEquals(filters.size(), 1);
Assert.assertTrue(filters.contains(filterName));
filterName = "ID";
filterExpr = "ID = rs123";
vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
Set<String> filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, true);
// Assert.assertEquals(filterWhenFailMissing.size(), 1);
// Assert.assertTrue(filterWhenFailMissing.contains(filterName));
filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
Assert.assertTrue(filterWhenFailMissing.isEmpty());
}
private static VariantContextBuilder buildDataForFilters() {
/**
* Uses (part of) the following (semi fake) data for testing (data was modified from real data so expect some minor inconsistencies in annotations)
* 1 1234567 . T C 152.03 .
* AC=6;AF=1.00;AN=6;DP=22;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;SOR=0.693;set=variant3-variant4-variant6
* GT:AD:DP:GQ:PGT:PID:PL:RGQ
* 1/1:0,2:9:6:1|1:15870493_CT_C:90,6,0
* 1/1:0,4:10:12:1|1:15870493_CT_C:180,12,0
* ./.:0:3:.:.:.:.:0
* ./.:0:0:.:.:.:.:0
* ./.:0:0:.:.:.:.:0
* 1/1:0,0:.:6:1|1:15870493_CT_C:90,6,0
*/
final Allele refT = Allele.create("T", true);
final Allele altC = Allele.create("C", false);
final Allele nocall = Allele.NO_CALL;
final VariantContextBuilder vcBuilder = new VariantContextBuilder("", "1", 1234567, 1234567, Arrays.asList(refT, altC));
vcBuilder.noID();
vcBuilder.attribute("AC", 6);
vcBuilder.attribute("AF", 1.00);
vcBuilder.attribute("AN", 6);
vcBuilder.attribute("DP", 22);
vcBuilder.attribute("ExcessHet", 3.0103);
vcBuilder.attribute("FS", 0.000);
vcBuilder.attribute("MLEAC", 2);
vcBuilder.attribute("MLEAF", 1.00);
vcBuilder.attribute("SOR", 0.693);
GenotypeBuilder gtBuilder = new GenotypeBuilder("one", Arrays.asList(altC,altC));
final Genotype firstSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
.DP(9) // edge case not passing "DP < 10"
.make();
gtBuilder = new GenotypeBuilder("two", Arrays.asList(altC,altC));
final Genotype secondSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
.DP(10) // edge case passing "DP < 10"
.make();
gtBuilder = new GenotypeBuilder("three", Arrays.asList(nocall,nocall));
final Genotype thirdSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
.DP(3)
.make();
gtBuilder = new GenotypeBuilder("four", Arrays.asList(nocall,nocall));
final Genotype fourthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
.DP(0)
.make();
gtBuilder = new GenotypeBuilder("five", Arrays.asList(nocall,nocall));
final Genotype fifthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
.DP(0)
.make();
gtBuilder = new GenotypeBuilder("six", Arrays.asList(altC,altC));
final Genotype sixthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
.DP(-1)
.make();
vcBuilder.genotypes(firstSample, secondSample, thirdSample, fourthSample, fifthSample, sixthSample);
return vcBuilder;
}
}