diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java
index 61d92f0bb..9f2678d20 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java
@@ -211,6 +211,24 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
executeTest("testFilteringDPfromFORMAT", spec);
}
+ @Test
+ public void testFilteringDPfromFORMATWithMissing() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ + " --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
+ Arrays.asList("4bf46103a71bac92a11eae04b97f9877"));
+ executeTest("testFilteringDPfromFORMATWithMissing", spec);
+ }
+
+ @Test
+ public void testFilteringDPfromFORMATAndFailMissing() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ + " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
+ Arrays.asList("baeda696c92adc8745ac4ebbdead6c91"));
+ executeTest("testFilteringDPfromFORMATAndFailMissing", spec);
+ }
+
@Test
public void testInvertGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java
index fb975c37b..c3087fe42 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java
@@ -25,6 +25,7 @@
package org.broadinstitute.gatk.tools.walkers.filters;
+import com.google.common.annotations.VisibleForTesting;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.commandline.*;
@@ -55,7 +56,7 @@ import java.util.*;
*
*
* This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered
- * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
+ * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
* in the output unless their removal is requested in the command line.
*
* The most common way of specifying filtering criteria is by using JEXL queries. See the
@@ -84,7 +85,7 @@ import java.util.*;
*
*
*
Caveat
- * when you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied
+ *
when you run {@link VariantFiltration} with a command that includes multiple logical parts, each part of the command is applied
* individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies
* some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a
* genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might
@@ -98,14 +99,17 @@ import java.util.*;
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltration extends RodWalker {
+ // -----------------------------------------------------------------------------------------------
+ // Arguments
+ // -----------------------------------------------------------------------------------------------
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed,
- * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used.
+ * i.e. filter variants that do not overlap with provided mask, then argument {@code -filterNotInMask} can be used.
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
- * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
+ * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false)
public RodBinding mask;
@@ -115,41 +119,41 @@ public class VariantFiltration extends RodWalker {
/**
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
- * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
+ * {@code --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"}).
*/
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
- protected ArrayList filterExpressions = new ArrayList();
+ protected ArrayList filterExpressions = new ArrayList<>();
/**
- * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
+ * This name is put in the FILTER
field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
*/
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
- protected ArrayList filterNames = new ArrayList();
+ protected ArrayList filterNames = new ArrayList<>();
/**
- * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
- * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
- * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
- * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). Also available are
- * expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object.
+ * Similar to the INFO
field based expressions, but used on the FORMAT
(genotype) fields instead.
+ * {@link VariantFiltration} will add the sample-level FT
tag to the FORMAT
field of filtered samples (this does not affect the record's FILTER
tag).
+ * One can filter normally based on most fields (e.g. {@code "GQ < 5.0"}), but the GT
(genotype) field is an exception.
+ * We have put in convenience methods so that one can now filter out hets ({@code "isHet == 1"}), refs ({@code "isHomRef == 1"}), or homs ({@code "isHomVar == 1"}).
+ * Also available are expressions {@code isCalled}, {@code isNoCall}, {@code isMixed}, and {@code isAvailable}, in accordance with the methods of the {@link Genotype} object.
*/
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
- protected ArrayList genotypeFilterExpressions = new ArrayList();
+ protected ArrayList genotypeFilterExpressions = new ArrayList<>();
/**
- * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
+ * Similar to the INFO
field based expressions, but used on the FORMAT
(genotype) fields instead.
*/
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
- protected ArrayList genotypeFilterNames = new ArrayList();
+ protected ArrayList genotypeFilterNames = new ArrayList<>();
/**
- * Works together with the --clusterWindowSize argument.
+ * Works together with the {@code --clusterWindowSize} argument.
*/
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
/**
- * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
+ * Works together with the {@code --clusterWindowSize} argument. To disable the clustered SNP filter, set this value to less than 1.
*/
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
@@ -158,45 +162,45 @@ public class VariantFiltration extends RodWalker {
protected Integer maskExtension = 0;
/**
- * When using the -mask argument, the maskName will be annotated in the variant record.
- * Note that when using the -filterNotInMask argument to reverse the masking logic,
+ * When using the {@code -mask} argument, the {@code maskName} will be annotated in the variant record.
+ * Note that when using the {@code -filterNotInMask} argument to reverse the masking logic,
* it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
- * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
+ * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String maskName = "Mask";
/**
- * By default, if the -mask argument is used, any variant falling in a mask will be filtered.
+ * By default, if the {@code -mask} argument is used, any variant falling in a mask will be filtered.
* If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered.
* Use case is, for example, if we have an interval list or BED file with "good" sites.
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
- * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
+ * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
*/
@Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false)
protected boolean filterRecordsNotInMask = false;
/**
- * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
+ * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASS
ing.
* Use this argument to have it evaluate as failing filters instead for these cases.
*/
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean failMissingValues = false;
/**
- * Invalidate previous filters applied to the VariantContext, applying only the filters here
+ * Invalidate previous filters applied to the {@link VariantContext}, applying only the filters here.
*/
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
boolean invalidatePrevious = false;
/**
- * Invert the selection criteria for --filterExpression
+ * Invert the selection criteria for {@code --filterExpression}.
*/
@Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
protected boolean invertFilterExpression = false;
/**
- * Invert the selection criteria for --genotypeFilterExpression
+ * Invert the selection criteria for {@code --genotypeFilterExpression}.
*/
@Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
protected boolean invertGenotypeFilterExpression = false;
@@ -207,81 +211,42 @@ public class VariantFiltration extends RodWalker {
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
private boolean setFilteredGenotypesToNocall = false;
+ // -----------------------------------------------------------------------------------------------
+ // Fields
+ // -----------------------------------------------------------------------------------------------
+
// JEXL expressions for the filters
List filterExps;
List genotypeFilterExps;
public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster";
+
private ClusteredSnps clusteredSNPs = null;
private GenomeLoc previousMaskPosition = null;
-
- // the structures necessary to initialize and maintain a windowed context
- private FiltrationContextWindow variantContextWindow;
+ private FiltrationContextWindow variantContextWindow; // the structures necessary to initialize and maintain a windowed context
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
- private ArrayList windowInitializer = new ArrayList();
+ private ArrayList windowInitializer = new ArrayList<>();
- private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
+ private static final List DIPLOID_NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
- /**
- * Prepend inverse phrase to description if --invertFilterExpression
- *
- * @param description the description
- * @return the description with inverse prepended if --invert_filter_expression
- */
- private String possiblyInvertFilterExpression( String description ){
- if ( invertFilterExpression )
- description = "Inverse of: " + description;
- return description;
- }
-
- private void initializeVcfWriter() {
-
- final List inputNames = Arrays.asList(variantCollection.variants.getName());
-
- // setup the header fields
- Set hInfo = new HashSet();
- hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
-
- // need AC, AN and AF since output if set filtered genotypes to no-call
- if ( setFilteredGenotypesToNocall ) {
- hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
- hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
- hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
- }
-
- if ( clusterWindow > 0 )
- hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
-
- if ( !genotypeFilterExps.isEmpty() )
- hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
-
- try {
- for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
- hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
- for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
- hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
-
- if ( mask.isBound() ) {
- if (filterRecordsNotInMask)
- hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
- else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
- }
- } catch (IllegalArgumentException e) {
- throw new UserException.BadInput(e.getMessage());
- }
-
- writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
- }
+ // -----------------------------------------------------------------------------------------------
+ // public methods from base classes
+ // -----------------------------------------------------------------------------------------------
public void initialize() {
- if ( clusterWindow > 0 )
- clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow);
- if ( maskExtension < 0 )
- throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
+ if ( maskExtension < 0 ) {
+ throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
+ }
+
+ if (filterRecordsNotInMask && !mask.isBound()) {
+ throw new UserException.BadArgumentValue("filterNotInMask", "argument not allowed if mask argument is not provided");
+ }
+
+ if ( clusterWindow > 0 ) {
+ clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(), clusterSize, clusterWindow);
+ }
- if (filterRecordsNotInMask && !mask.isBound())
- throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
@@ -290,44 +255,45 @@ public class VariantFiltration extends RodWalker {
initializeVcfWriter();
}
- public Integer reduceInit() { return 0; }
-
/**
*
* @param tracker the meta-data tracker
* @param ref the reference base
* @param context the context for the given locus
- * @return 1 if the locus was successfully processed, 0 if otherwise
+ * @return 1 if the locus was successfully processed, 0 otherwise
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- if ( tracker == null )
+ if ( tracker == null ) {
return 0;
+ }
- Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation());
+ final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation());
// is there a SNP mask present?
- boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
- if ( hasMask )
+ final boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
+ if ( hasMask ) {
previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask
+ }
for ( VariantContext vc : VCs ) {
if ( invalidatePrevious ) {
- vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make();
+ vc = (new VariantContextBuilder(vc)).filters(new HashSet<>()).make();
}
// filter based on previous mask position
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
- FiltrationContext varContext = new FiltrationContext(ref, vc);
+ final FiltrationContext varContext = new FiltrationContext(ref, vc);
// if we're still initializing the context, do so
if ( windowInitializer != null ) {
// if this is a mask position, filter previous records
if ( hasMask ) {
- for ( FiltrationContext prevVC : windowInitializer )
+ for ( final FiltrationContext prevVC : windowInitializer ) {
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
+ }
}
windowInitializer.add(varContext);
@@ -339,9 +305,10 @@ public class VariantFiltration extends RodWalker {
// if this is a mask position, filter previous records
if ( hasMask ) {
- for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
- if ( prevVC != null )
+ for ( final FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
+ if ( prevVC != null ) {
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
+ }
}
}
@@ -353,6 +320,244 @@ public class VariantFiltration extends RodWalker {
return 1;
}
+ public Integer reduce(Integer value, Integer sum) {
+ return sum + value;
+ }
+
+ public Integer reduceInit() { return 0; }
+
+ /**
+ * Tell the user the number of loci processed and close out the new variants file.
+ *
+ * @param result the number of loci seen.
+ */
+ public void onTraversalDone(Integer result) {
+ // move the window over so that we can filter the last few variants
+ if ( windowInitializer != null ) {
+ while ( windowInitializer.size() < WINDOW_SIZE ) {
+ windowInitializer.add(null);
+ }
+ variantContextWindow = new FiltrationContextWindow(windowInitializer);
+ }
+ for (int i=0; i < WINDOW_SIZE; i++) {
+ variantContextWindow.moveWindow(null);
+ filter();
+ }
+ }
+
+ // -----------------------------------------------------------------------------------------------
+ // main filtering steps
+ // -----------------------------------------------------------------------------------------------
+
+ /**
+ * Organizing filters: genotype filters and normal filters.
+ */
+ private void filter() {
+ // get the current context
+ final FiltrationContext context = variantContextWindow.getContext();
+ if ( context == null ) {
+ return;
+ }
+
+ final VariantContext vc = context.getVariantContext();
+
+ // make new Genotypes based on genotype filters
+ final VariantContextBuilder builder = ( genotypeFilterExps.isEmpty() && !setFilteredGenotypesToNocall ) ? new VariantContextBuilder(vc)
+ : applyGenotypeFilters(vc, genotypeFilterExps, invertGenotypeFilterExpression, failMissingValues, setFilteredGenotypesToNocall);
+
+ // extract filters already in VC and append new filters
+ final Set filters = buildVCfilters(vc, filterExps, invertFilterExpression, failMissingValues);
+ // test for clustered SNPs if requested
+ if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) {
+ filters.add(CLUSTERED_SNP_FILTER_NAME);
+ }
+
+ // make a new variant context based on all filters, and write
+ writer.add(filters.isEmpty() ? builder.passFilters().make() : builder.filters(filters).make());
+ }
+
+ /**
+ * Given a VC builder and a vc (which was used to construct the builder), update the properties that the builder
+ * will use to construct a new VC, based on some of the attributes/annotations of the old VC.
+ * @param vc variant context holding genotypes to be filtered
+ * @param genotypeFilterExpressions genotype filter expressions
+ * @param invertGenotypeFilterExpression should invert the genotype filter expression or not
+ * @param failIfMissingValues if sample misses the corresponding annotation(s) the filter(s) work by, should we fail them or not
+ * @param setFilteredGenotypesToNocall if sample is filtered should we set genotype to non-call or not
+ */
+ @VisibleForTesting
+ static VariantContextBuilder applyGenotypeFilters(final VariantContext vc,
+ final List genotypeFilterExpressions,
+ final boolean invertGenotypeFilterExpression,
+ final boolean failIfMissingValues,
+ final boolean setFilteredGenotypesToNocall) {
+
+ final VariantContextBuilder builder = new VariantContextBuilder(vc);
+
+ final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
+
+ // recompute AC, AN and AF if filtered genotypes are set to no-call
+ // occurrences of alternate alleles over all genotypes
+ final Map calledAltAlleles = new LinkedHashMap<>(vc.getNAlleles()-1);
+ for ( final Allele altAllele : vc.getAlternateAlleles() ) {
+ calledAltAlleles.put(altAllele, 0);
+ }
+
+ int calledAlleles = 0;
+ boolean haveFilteredNoCallAlleles = false;
+
+ // for each genotype, check filters then create a new object
+ for ( final Genotype g : vc.getGenotypes() ) {
+ if ( g.isCalled() ) {
+ final List filters = new ArrayList<>();
+ if ( g.isFiltered() ) filters.add(g.getFilters());
+
+ // Add if expression filters the variant context
+ for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExpressions ) {
+ try {
+ if (Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression)) {
+ filters.add(exp.name);
+ }
+ } catch (final IllegalArgumentException e) {
+ // logic: right now (2016/08/18) if a filter is applied based on specific annotation and some sample contains missing value for such annotation,
+ // lower level code will throw IllegalArgumentException, therefore we specifically catch this type of exception
+ // do nothing unless specifically asked to; it just means that the expression isn't defined for this context
+ if ( failIfMissingValues ) {
+ filters.add(exp.name);
+ }
+ }
+ }
+
+ // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
+ if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
+ haveFilteredNoCallAlleles = true;
+ genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(DIPLOID_NO_CALL_ALLELES).make());
+ }
+ else {
+ genotypes.add(new GenotypeBuilder(g).filters(filters).make());
+ calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
+ }
+ } else {
+ genotypes.add(g);
+ }
+ }
+
+ builder.genotypes(genotypes);
+ // if filtered genotypes are set to no-call, output recomputed AC, AN, AF
+ if ( haveFilteredNoCallAlleles ) {
+ GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
+ }
+
+ return builder;
+ }
+
+ /**
+ * Extract filters already present in the {@code vc}, and append user provided expressions.
+ * For user provided genotype filter expressions, see {@link #applyGenotypeFilters(VariantContext, List, boolean, boolean, boolean)}
+ * @param vc VC from which filters to be extracted
+ * @param vcFilterExpressions more filter expressions provided by user
+ * @param invertVCfilterExpression should we invert the logic in expressions provided in {@code vcFilterExpressions}
+ * @param failIfMissingValues should we mark the VC as failing if it misses the value the filters work on
+ *
+ * @return filters already in the provided vc and user-provided filters
+ */
+ @VisibleForTesting
+ static Set buildVCfilters(final VariantContext vc,
+ final List vcFilterExpressions,
+ final boolean invertVCfilterExpression,
+ final boolean failIfMissingValues) {
+
+ final Set filters = new LinkedHashSet<>(vc.getFilters());
+
+ for ( final VariantContextUtils.JexlVCMatchExp exp : vcFilterExpressions ) {
+ try {
+ if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertVCfilterExpression) ) {
+ filters.add(exp.name);
+ }
+ } catch (final Exception e) {
+ // do nothing unless specifically asked to; it just means that the expression isn't defined for this context
+ if ( failIfMissingValues ) {
+ filters.add(exp.name);
+ }
+ }
+ }
+ return filters;
+ }
+
+ // -----------------------------------------------------------------------------------------------
+ // some other complications besides main stuff
+ // -----------------------------------------------------------------------------------------------
+
+ private void initializeVcfWriter() {
+
+ final List inputNames = Arrays.asList(variantCollection.variants.getName());
+
+ // setup the header fields
+ final Set hInfo = new HashSet();
+ hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
+
+ // need AC, AN and AF since output if set filtered genotypes to no-call
+ if ( setFilteredGenotypesToNocall ) {
+ hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
+ hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
+ hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
+ }
+
+ if ( clusterWindow > 0 ) {
+ hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
+ }
+
+ if ( !genotypeFilterExps.isEmpty() ) {
+ hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
+ }
+
+ try {
+ for ( final VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
+ hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
+ }
+ for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
+ hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
+ }
+
+ if ( mask.isBound() ) {
+ hInfo.add(new VCFFilterHeaderLine(maskName, filterRecordsNotInMask ? "Doesn't overlap a user-input mask" : "Overlaps a user-input mask"));
+ }
+ } catch (final IllegalArgumentException e) {
+ throw new UserException.BadInput(e.getMessage());
+ }
+
+ writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
+ }
+
+ /**
+ * Prepend inverse phrase to description if {@code --invertFilterExpression}.
+ *
+ * @param description the description
+ * @return the description with inverse prepended if {@code --invert_filter_expression}.
+ */
+ private String possiblyInvertFilterExpression( final String description ){
+ return invertFilterExpression ? "Inverse of: " + description : description;
+ }
+
+ /**
+ * Add mask to variant context filters if it covers its location.
+ * @param vc VariantContext
+ * @param genomeLoc genome location
+ * @param maskName name of the mask
+ * @param maskExtension bases beyond the mask
+ * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
+ * @return VariantContext with the mask added if the VariantContext is within the extended mask area
+ */
+ private VariantContext addMaskIfCoversVariant(VariantContext vc, final GenomeLoc genomeLoc, final String maskName, final int maskExtension, final boolean locStart) {
+ if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
+ final Set filters = new LinkedHashSet<>(vc.getFilters());
+ filters.add(maskName);
+ vc = new VariantContextBuilder(vc).filters(filters).make();
+ }
+
+ return vc;
+ }
+
/**
* Helper function to check if a mask covers the variant location.
*
@@ -364,141 +569,14 @@ public class VariantFiltration extends RodWalker {
* @return true if the genome location is within the extended mask area, false otherwise
*/
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
- boolean logic = genomeLoc != null && // have a location
- genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
+ final boolean needToCheckOveralpping = genomeLoc != null && // have a location
+ genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
- if ( logic ) {
- if (vcBeforeLoc)
- return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
- else
- return vc.getStart() - genomeLoc.getStop() <= maskExtension;
+ if ( needToCheckOveralpping ) {
+ return vcBeforeLoc ? (genomeLoc.getStart() - vc.getEnd() <= maskExtension) // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
+ : (vc.getStart() - genomeLoc.getStop() <= maskExtension);
} else {
return false;
}
}
-
- /**
- * Add mask to variant context filters if it covers the it's location
- *
- * @param vc VariantContext
- * @param genomeLoc genome location
- * @param maskName name of the mask
- * @param maskExtension bases beyond the mask
- * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
- * @return VariantContext with the mask added if the VariantContext is within the extended mask area
- */
- private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) {
- if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
- Set filters = new LinkedHashSet(vc.getFilters());
- filters.add(maskName);
- vc = new VariantContextBuilder(vc).filters(filters).make();
- }
-
- return vc;
- }
-
- private void filter() {
- // get the current context
- FiltrationContext context = variantContextWindow.getContext();
- if ( context == null )
- return;
-
- final VariantContext vc = context.getVariantContext();
- final VariantContextBuilder builder = new VariantContextBuilder(vc);
-
- // make new Genotypes based on filters
- if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) {
- GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
-
- //
- // recompute AC, AN and AF if filtered genotypes are set to no-call
- //
- // occurrences of alternate alleles over all genotypes
- final Map calledAltAlleles = new LinkedHashMap(vc.getNAlleles()-1);
- for ( final Allele altAllele : vc.getAlternateAlleles() ) {
- calledAltAlleles.put(altAllele, 0);
- }
-
- int calledAlleles = 0;
- boolean haveFilteredNoCallAlleles = false;
-
- // for each genotype, check filters then create a new object
- for ( final Genotype g : vc.getGenotypes() ) {
- if ( g.isCalled() ) {
- final List filters = new ArrayList();
- if ( g.isFiltered() ) filters.add(g.getFilters());
-
- // Add if expression filters the variant context
- for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
- if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
- filters.add(exp.name);
- }
-
- // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
- if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
- haveFilteredNoCallAlleles = true;
- genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make());
- }
- else {
- genotypes.add(new GenotypeBuilder(g).filters(filters).make());
- calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
- }
- } else {
- genotypes.add(g);
- }
- }
-
- builder.genotypes(genotypes);
- // if filtered genotypes are set to no-call, output recomputed AC, AN, AF
- if ( haveFilteredNoCallAlleles )
- GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
- }
-
- // make a new variant context based on filters
- Set filters = new LinkedHashSet(vc.getFilters());
-
- // test for clustered SNPs if requested
- if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) )
- filters.add(CLUSTERED_SNP_FILTER_NAME);
-
- for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
- try {
- if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) )
- filters.add(exp.name);
- } catch (Exception e) {
- // do nothing unless specifically asked to; it just means that the expression isn't defined for this context
- if ( failMissingValues )
- filters.add(exp.name);
- }
- }
-
- if ( filters.isEmpty() )
- builder.passFilters();
- else
- builder.filters(filters);
-
- writer.add(builder.make());
- }
-
- public Integer reduce(Integer value, Integer sum) {
- return sum + value;
- }
-
- /**
- * Tell the user the number of loci processed and close out the new variants file.
- *
- * @param result the number of loci seen.
- */
- public void onTraversalDone(Integer result) {
- // move the window over so that we can filter the last few variants
- if ( windowInitializer != null ) {
- while ( windowInitializer.size() < WINDOW_SIZE )
- windowInitializer.add(null);
- variantContextWindow = new FiltrationContextWindow(windowInitializer);
- }
- for (int i=0; i < WINDOW_SIZE; i++) {
- variantContextWindow.moveWindow(null);
- filter();
- }
- }
}
diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java
index 3ec696592..5262345fc 100644
--- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java
+++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java
@@ -26,21 +26,24 @@
package org.broadinstitute.gatk.tools.walkers.filters;
import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.testng.Assert;
+import org.testng.annotations.BeforeTest;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import org.testng.Assert;
-import org.testng.annotations.*;
+import java.util.Set;
+import java.util.stream.Collectors;
public class VariantFiltrationUnitTest extends BaseTest {
@@ -58,7 +61,7 @@ public class VariantFiltrationUnitTest extends BaseTest {
}
@DataProvider(name = "VariantMaskData")
- public Object[][] DoesMaskCoverVariantTestData() {
+ public Object[][] doesMaskCoverVariantTestData() {
final String maskName = "testMask";
@@ -85,8 +88,8 @@ public class VariantFiltrationUnitTest extends BaseTest {
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
* @param expectedValue return the expected return value from doesMaskCoverVariant()
*/
- @Test(dataProvider = "VariantMaskData")
- public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
+ @Test(dataProvider = "doesMaskCoverVariantTestData")
+ public void testDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
final boolean vcBeforeLoc, final boolean expectedValue) {
// Build VariantContext
@@ -104,4 +107,126 @@ public class VariantFiltrationUnitTest extends BaseTest {
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
Assert.assertEquals(coversVariant, expectedValue);
}
+
+ @Test
+ public void testApplyGenotypeFilters(){
+
+ final VariantContext vc = buildDataForFilters().make();
+
+ final String filterName = "LowDP";
+ final String filterExpr = "DP < 10";
+
+ final List genotypeFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
+
+ final VariantContextBuilder anotherVCBuilder = VariantFiltration.applyGenotypeFilters(vc, genotypeFilterExps, false, false, false);
+ final VariantContext anotherVC = anotherVCBuilder.filters().make();
+
+ Assert.assertEquals(anotherVC.getGenotype("one").isFiltered(), true);
+ Assert.assertTrue(anotherVC.getGenotype("one").getFilters().equals(filterName));
+
+ Assert.assertEquals(anotherVC.getGenotype("two").isFiltered(), false);
+
+ Assert.assertEquals(anotherVC.getGenotype("three").isFiltered(), false);
+
+ Assert.assertEquals(anotherVC.getGenotype("four").isFiltered(), false);
+
+ Assert.assertEquals(anotherVC.getGenotype("five").isFiltered(), false);
+
+ Assert.assertEquals(anotherVC.getGenotype("six").isFiltered(), false);
+
+ final VariantContextBuilder yetAnotherVCBuilder = VariantFiltration.applyGenotypeFilters(anotherVC, genotypeFilterExps, false, true, false);
+ final VariantContext yetAnotherVC = yetAnotherVCBuilder.filters().make();
+ Assert.assertEquals(yetAnotherVC.getGenotype("six").isFiltered(), true);
+ Assert.assertTrue(yetAnotherVC.getGenotype("six").getFilters().equals(filterName));
+ }
+
+ @Test
+ public void testApplyVCFilters(){
+
+ final VariantContext vcNoFilters = buildDataForFilters().make(); // assumes this vc doesn't hold any filters yet
+
+ String filterName = "LowDP";
+ String filterExpr = "DP < 23";
+ List vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
+
+ final Set filters = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
+ Assert.assertFalse(vcNoFilters.isFiltered());
+ Assert.assertEquals(filters.size(), 1);
+ Assert.assertTrue(filters.contains(filterName));
+
+ filterName = "ID";
+ filterExpr = "ID = rs123";
+ vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
+ Set filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, true);
+// Assert.assertEquals(filterWhenFailMissing.size(), 1);
+// Assert.assertTrue(filterWhenFailMissing.contains(filterName));
+ filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
+ Assert.assertTrue(filterWhenFailMissing.isEmpty());
+ }
+
+
+ private static VariantContextBuilder buildDataForFilters() {
+ /**
+ * Uses (part of) the following (semi fake) data for testing (data was modified from real data so expect some minor inconsistencies in annotations)
+ * 1 1234567 . T C 152.03 .
+ * AC=6;AF=1.00;AN=6;DP=22;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;SOR=0.693;set=variant3-variant4-variant6
+ * GT:AD:DP:GQ:PGT:PID:PL:RGQ
+ * 1/1:0,2:9:6:1|1:15870493_CT_C:90,6,0
+ * 1/1:0,4:10:12:1|1:15870493_CT_C:180,12,0
+ * ./.:0:3:.:.:.:.:0
+ * ./.:0:0:.:.:.:.:0
+ * ./.:0:0:.:.:.:.:0
+ * 1/1:0,0:.:6:1|1:15870493_CT_C:90,6,0
+ */
+
+ final Allele refT = Allele.create("T", true);
+ final Allele altC = Allele.create("C", false);
+ final Allele nocall = Allele.NO_CALL;
+
+ final VariantContextBuilder vcBuilder = new VariantContextBuilder("", "1", 1234567, 1234567, Arrays.asList(refT, altC));
+
+ vcBuilder.noID();
+ vcBuilder.attribute("AC", 6);
+ vcBuilder.attribute("AF", 1.00);
+ vcBuilder.attribute("AN", 6);
+ vcBuilder.attribute("DP", 22);
+ vcBuilder.attribute("ExcessHet", 3.0103);
+ vcBuilder.attribute("FS", 0.000);
+ vcBuilder.attribute("MLEAC", 2);
+ vcBuilder.attribute("MLEAF", 1.00);
+ vcBuilder.attribute("SOR", 0.693);
+
+ GenotypeBuilder gtBuilder = new GenotypeBuilder("one", Arrays.asList(altC,altC));
+ final Genotype firstSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
+ .DP(9) // edge case not passing "DP < 10"
+ .make();
+
+ gtBuilder = new GenotypeBuilder("two", Arrays.asList(altC,altC));
+ final Genotype secondSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
+ .DP(10) // edge case passing "DP < 10"
+ .make();
+
+ gtBuilder = new GenotypeBuilder("three", Arrays.asList(nocall,nocall));
+ final Genotype thirdSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
+ .DP(3)
+ .make();
+
+ gtBuilder = new GenotypeBuilder("four", Arrays.asList(nocall,nocall));
+ final Genotype fourthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
+ .DP(0)
+ .make();
+
+ gtBuilder = new GenotypeBuilder("five", Arrays.asList(nocall,nocall));
+ final Genotype fifthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
+ .DP(0)
+ .make();
+
+ gtBuilder = new GenotypeBuilder("six", Arrays.asList(altC,altC));
+ final Genotype sixthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
+ .DP(-1)
+ .make();
+
+ vcBuilder.genotypes(firstSample, secondSample, thirdSample, fourthSample, fifthSample, sixthSample);
+ return vcBuilder;
+ }
}