From 3c88e6859f0d0987df3f37e2765cacda06aaa8cf Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Tue, 6 Sep 2016 17:30:18 -0400 Subject: [PATCH] fix 1430 for genotype filters; refactored filter() method; added unit and integration test; more comprehensive fix must be done first on htsjdk side in JEXLMap (#1456) --- .../VariantFiltrationIntegrationTest.java | 18 + .../walkers/filters/VariantFiltration.java | 542 ++++++++++-------- .../filters/VariantFiltrationUnitTest.java | 141 ++++- 3 files changed, 461 insertions(+), 240 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index 61d92f0bb..9f2678d20 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -211,6 +211,24 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("testFilteringDPfromFORMAT", spec); } + @Test + public void testFilteringDPfromFORMATWithMissing() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, + Arrays.asList("4bf46103a71bac92a11eae04b97f9877")); + executeTest("testFilteringDPfromFORMATWithMissing", spec); + } + + @Test + public void testFilteringDPfromFORMATAndFailMissing() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1, + Arrays.asList("baeda696c92adc8745ac4ebbdead6c91")); + executeTest("testFilteringDPfromFORMATAndFailMissing", spec); + } + @Test public void testInvertGenotypeFilterExpression() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index fb975c37b..c3087fe42 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; +import com.google.common.annotations.VisibleForTesting; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.*; @@ -55,7 +56,7 @@ import java.util.*; * *

* This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered - * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved + * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved * in the output unless their removal is requested in the command line.

* *

The most common way of specifying filtering criteria is by using JEXL queries. See the @@ -84,7 +85,7 @@ import java.util.*; * * *

Caveat

- *

when you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied + *

when you run {@link VariantFiltration} with a command that includes multiple logical parts, each part of the command is applied * individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies * some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a * genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might @@ -98,14 +99,17 @@ import java.util.*; @Reference(window=@Window(start=-50,stop=50)) public class VariantFiltration extends RodWalker { + // ----------------------------------------------------------------------------------------------- + // Arguments + // ----------------------------------------------------------------------------------------------- @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** * Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed, - * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used. + * i.e. filter variants that do not overlap with provided mask, then argument {@code -filterNotInMask} can be used. * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). */ @Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @@ -115,41 +119,41 @@ public class VariantFiltration extends RodWalker { /** * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using - * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). + * {@code --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"}). */ @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) - protected ArrayList filterExpressions = new ArrayList(); + protected ArrayList filterExpressions = new ArrayList<>(); /** - * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. + * This name is put in the

FILTER
field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. */ @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) - protected ArrayList filterNames = new ArrayList(); + protected ArrayList filterNames = new ArrayList<>(); /** - * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. - * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag). - * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience - * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). Also available are - * expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object. + * Similar to the
INFO
field based expressions, but used on the
FORMAT
(genotype) fields instead. + * {@link VariantFiltration} will add the sample-level
FT
tag to the
FORMAT
field of filtered samples (this does not affect the record's
FILTER
tag). + * One can filter normally based on most fields (e.g. {@code "GQ < 5.0"}), but the
GT
(genotype) field is an exception. + * We have put in convenience methods so that one can now filter out hets ({@code "isHet == 1"}), refs ({@code "isHomRef == 1"}), or homs ({@code "isHomVar == 1"}). + * Also available are expressions {@code isCalled}, {@code isNoCall}, {@code isMixed}, and {@code isAvailable}, in accordance with the methods of the {@link Genotype} object. */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false) - protected ArrayList genotypeFilterExpressions = new ArrayList(); + protected ArrayList genotypeFilterExpressions = new ArrayList<>(); /** - * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + * Similar to the
INFO
field based expressions, but used on the
FORMAT
(genotype) fields instead. */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) - protected ArrayList genotypeFilterNames = new ArrayList(); + protected ArrayList genotypeFilterNames = new ArrayList<>(); /** - * Works together with the --clusterWindowSize argument. + * Works together with the {@code --clusterWindowSize} argument. */ @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false) protected Integer clusterSize = 3; /** - * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1. + * Works together with the {@code --clusterWindowSize} argument. To disable the clustered SNP filter, set this value to less than 1. */ @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false) protected Integer clusterWindow = 0; @@ -158,45 +162,45 @@ public class VariantFiltration extends RodWalker { protected Integer maskExtension = 0; /** - * When using the -mask argument, the maskName will be annotated in the variant record. - * Note that when using the -filterNotInMask argument to reverse the masking logic, + * When using the {@code -mask} argument, the {@code maskName} will be annotated in the variant record. + * Note that when using the {@code -filterNotInMask} argument to reverse the masking logic, * it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). */ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String maskName = "Mask"; /** - * By default, if the -mask argument is used, any variant falling in a mask will be filtered. + * By default, if the {@code -mask} argument is used, any variant falling in a mask will be filtered. * If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered. * Use case is, for example, if we have an interval list or BED file with "good" sites. * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used - * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + * (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking). */ @Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false) protected boolean filterRecordsNotInMask = false; /** - * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. + * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as
PASS
ing. * Use this argument to have it evaluate as failing filters instead for these cases. */ @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean failMissingValues = false; /** - * Invalidate previous filters applied to the VariantContext, applying only the filters here + * Invalidate previous filters applied to the {@link VariantContext}, applying only the filters here. */ @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) boolean invalidatePrevious = false; /** - * Invert the selection criteria for --filterExpression + * Invert the selection criteria for {@code --filterExpression}. */ @Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false) protected boolean invertFilterExpression = false; /** - * Invert the selection criteria for --genotypeFilterExpression + * Invert the selection criteria for {@code --genotypeFilterExpression}. */ @Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) protected boolean invertGenotypeFilterExpression = false; @@ -207,81 +211,42 @@ public class VariantFiltration extends RodWalker { @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") private boolean setFilteredGenotypesToNocall = false; + // ----------------------------------------------------------------------------------------------- + // Fields + // ----------------------------------------------------------------------------------------------- + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster"; + private ClusteredSnps clusteredSNPs = null; private GenomeLoc previousMaskPosition = null; - - // the structures necessary to initialize and maintain a windowed context - private FiltrationContextWindow variantContextWindow; + private FiltrationContextWindow variantContextWindow; // the structures necessary to initialize and maintain a windowed context private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one - private ArrayList windowInitializer = new ArrayList(); + private ArrayList windowInitializer = new ArrayList<>(); - private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private static final List DIPLOID_NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - /** - * Prepend inverse phrase to description if --invertFilterExpression - * - * @param description the description - * @return the description with inverse prepended if --invert_filter_expression - */ - private String possiblyInvertFilterExpression( String description ){ - if ( invertFilterExpression ) - description = "Inverse of: " + description; - return description; - } - - private void initializeVcfWriter() { - - final List inputNames = Arrays.asList(variantCollection.variants.getName()); - - // setup the header fields - Set hInfo = new HashSet(); - hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); - - // need AC, AN and AF since output if set filtered genotypes to no-call - if ( setFilteredGenotypesToNocall ) { - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); - hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); - } - - if ( clusterWindow > 0 ) - hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); - - if ( !genotypeFilterExps.isEmpty() ) - hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - - try { - for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); - for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); - - if ( mask.isBound() ) { - if (filterRecordsNotInMask) - hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask")); - else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask")); - } - } catch (IllegalArgumentException e) { - throw new UserException.BadInput(e.getMessage()); - } - - writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); - } + // ----------------------------------------------------------------------------------------------- + // public methods from base classes + // ----------------------------------------------------------------------------------------------- public void initialize() { - if ( clusterWindow > 0 ) - clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow); - if ( maskExtension < 0 ) - throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); + if ( maskExtension < 0 ) { + throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); + } + + if (filterRecordsNotInMask && !mask.isBound()) { + throw new UserException.BadArgumentValue("filterNotInMask", "argument not allowed if mask argument is not provided"); + } + + if ( clusterWindow > 0 ) { + clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(), clusterSize, clusterWindow); + } - if (filterRecordsNotInMask && !mask.isBound()) - throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided"); filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions); genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions); @@ -290,44 +255,45 @@ public class VariantFiltration extends RodWalker { initializeVcfWriter(); } - public Integer reduceInit() { return 0; } - /** * * @param tracker the meta-data tracker * @param ref the reference base * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise + * @return 1 if the locus was successfully processed, 0 otherwise */ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) + if ( tracker == null ) { return 0; + } - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); // is there a SNP mask present? - boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask)); - if ( hasMask ) + final boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask)); + if ( hasMask ) { previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask + } for ( VariantContext vc : VCs ) { if ( invalidatePrevious ) { - vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); + vc = (new VariantContextBuilder(vc)).filters(new HashSet<>()).make(); } // filter based on previous mask position vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false); - FiltrationContext varContext = new FiltrationContext(ref, vc); + final FiltrationContext varContext = new FiltrationContext(ref, vc); // if we're still initializing the context, do so if ( windowInitializer != null ) { // if this is a mask position, filter previous records if ( hasMask ) { - for ( FiltrationContext prevVC : windowInitializer ) + for ( final FiltrationContext prevVC : windowInitializer ) { prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); + } } windowInitializer.add(varContext); @@ -339,9 +305,10 @@ public class VariantFiltration extends RodWalker { // if this is a mask position, filter previous records if ( hasMask ) { - for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) { - if ( prevVC != null ) + for ( final FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) { + if ( prevVC != null ) { prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); + } } } @@ -353,6 +320,244 @@ public class VariantFiltration extends RodWalker { return 1; } + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public Integer reduceInit() { return 0; } + + /** + * Tell the user the number of loci processed and close out the new variants file. + * + * @param result the number of loci seen. + */ + public void onTraversalDone(Integer result) { + // move the window over so that we can filter the last few variants + if ( windowInitializer != null ) { + while ( windowInitializer.size() < WINDOW_SIZE ) { + windowInitializer.add(null); + } + variantContextWindow = new FiltrationContextWindow(windowInitializer); + } + for (int i=0; i < WINDOW_SIZE; i++) { + variantContextWindow.moveWindow(null); + filter(); + } + } + + // ----------------------------------------------------------------------------------------------- + // main filtering steps + // ----------------------------------------------------------------------------------------------- + + /** + * Organizing filters: genotype filters and normal filters. + */ + private void filter() { + // get the current context + final FiltrationContext context = variantContextWindow.getContext(); + if ( context == null ) { + return; + } + + final VariantContext vc = context.getVariantContext(); + + // make new Genotypes based on genotype filters + final VariantContextBuilder builder = ( genotypeFilterExps.isEmpty() && !setFilteredGenotypesToNocall ) ? new VariantContextBuilder(vc) + : applyGenotypeFilters(vc, genotypeFilterExps, invertGenotypeFilterExpression, failMissingValues, setFilteredGenotypesToNocall); + + // extract filters already in VC and append new filters + final Set filters = buildVCfilters(vc, filterExps, invertFilterExpression, failMissingValues); + // test for clustered SNPs if requested + if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) { + filters.add(CLUSTERED_SNP_FILTER_NAME); + } + + // make a new variant context based on all filters, and write + writer.add(filters.isEmpty() ? builder.passFilters().make() : builder.filters(filters).make()); + } + + /** + * Given a VC builder and a vc (which was used to construct the builder), update the properties that the builder + * will use to construct a new VC, based on some of the attributes/annotations of the old VC. + * @param vc variant context holding genotypes to be filtered + * @param genotypeFilterExpressions genotype filter expressions + * @param invertGenotypeFilterExpression should invert the genotype filter expression or not + * @param failIfMissingValues if sample misses the corresponding annotation(s) the filter(s) work by, should we fail them or not + * @param setFilteredGenotypesToNocall if sample is filtered should we set genotype to non-call or not + */ + @VisibleForTesting + static VariantContextBuilder applyGenotypeFilters(final VariantContext vc, + final List genotypeFilterExpressions, + final boolean invertGenotypeFilterExpression, + final boolean failIfMissingValues, + final boolean setFilteredGenotypesToNocall) { + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); + + // recompute AC, AN and AF if filtered genotypes are set to no-call + // occurrences of alternate alleles over all genotypes + final Map calledAltAlleles = new LinkedHashMap<>(vc.getNAlleles()-1); + for ( final Allele altAllele : vc.getAlternateAlleles() ) { + calledAltAlleles.put(altAllele, 0); + } + + int calledAlleles = 0; + boolean haveFilteredNoCallAlleles = false; + + // for each genotype, check filters then create a new object + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.isCalled() ) { + final List filters = new ArrayList<>(); + if ( g.isFiltered() ) filters.add(g.getFilters()); + + // Add if expression filters the variant context + for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExpressions ) { + try { + if (Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression)) { + filters.add(exp.name); + } + } catch (final IllegalArgumentException e) { + // logic: right now (2016/08/18) if a filter is applied based on specific annotation and some sample contains missing value for such annotation, + // lower level code will throw IllegalArgumentException, therefore we specifically catch this type of exception + // do nothing unless specifically asked to; it just means that the expression isn't defined for this context + if ( failIfMissingValues ) { + filters.add(exp.name); + } + } + } + + // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call + if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) { + haveFilteredNoCallAlleles = true; + genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(DIPLOID_NO_CALL_ALLELES).make()); + } + else { + genotypes.add(new GenotypeBuilder(g).filters(filters).make()); + calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g); + } + } else { + genotypes.add(g); + } + } + + builder.genotypes(genotypes); + // if filtered genotypes are set to no-call, output recomputed AC, AN, AF + if ( haveFilteredNoCallAlleles ) { + GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder); + } + + return builder; + } + + /** + * Extract filters already present in the {@code vc}, and append user provided expressions. + * For user provided genotype filter expressions, see {@link #applyGenotypeFilters(VariantContext, List, boolean, boolean, boolean)} + * @param vc VC from which filters to be extracted + * @param vcFilterExpressions more filter expressions provided by user + * @param invertVCfilterExpression should we invert the logic in expressions provided in {@code vcFilterExpressions} + * @param failIfMissingValues should we mark the VC as failing if it misses the value the filters work on + * + * @return filters already in the provided vc and user-provided filters + */ + @VisibleForTesting + static Set buildVCfilters(final VariantContext vc, + final List vcFilterExpressions, + final boolean invertVCfilterExpression, + final boolean failIfMissingValues) { + + final Set filters = new LinkedHashSet<>(vc.getFilters()); + + for ( final VariantContextUtils.JexlVCMatchExp exp : vcFilterExpressions ) { + try { + if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertVCfilterExpression) ) { + filters.add(exp.name); + } + } catch (final Exception e) { + // do nothing unless specifically asked to; it just means that the expression isn't defined for this context + if ( failIfMissingValues ) { + filters.add(exp.name); + } + } + } + return filters; + } + + // ----------------------------------------------------------------------------------------------- + // some other complications besides main stuff + // ----------------------------------------------------------------------------------------------- + + private void initializeVcfWriter() { + + final List inputNames = Arrays.asList(variantCollection.variants.getName()); + + // setup the header fields + final Set hInfo = new HashSet(); + hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); + + // need AC, AN and AF since output if set filtered genotypes to no-call + if ( setFilteredGenotypesToNocall ) { + hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); + hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); + hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); + } + + if ( clusterWindow > 0 ) { + hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); + } + + if ( !genotypeFilterExps.isEmpty() ) { + hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + } + + try { + for ( final VariantContextUtils.JexlVCMatchExp exp : filterExps ) { + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); + } + for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); + } + + if ( mask.isBound() ) { + hInfo.add(new VCFFilterHeaderLine(maskName, filterRecordsNotInMask ? "Doesn't overlap a user-input mask" : "Overlaps a user-input mask")); + } + } catch (final IllegalArgumentException e) { + throw new UserException.BadInput(e.getMessage()); + } + + writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); + } + + /** + * Prepend inverse phrase to description if {@code --invertFilterExpression}. + * + * @param description the description + * @return the description with inverse prepended if {@code --invert_filter_expression}. + */ + private String possiblyInvertFilterExpression( final String description ){ + return invertFilterExpression ? "Inverse of: " + description : description; + } + + /** + * Add mask to variant context filters if it covers its location. + * @param vc VariantContext + * @param genomeLoc genome location + * @param maskName name of the mask + * @param maskExtension bases beyond the mask + * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite. + * @return VariantContext with the mask added if the VariantContext is within the extended mask area + */ + private VariantContext addMaskIfCoversVariant(VariantContext vc, final GenomeLoc genomeLoc, final String maskName, final int maskExtension, final boolean locStart) { + if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) { + final Set filters = new LinkedHashSet<>(vc.getFilters()); + filters.add(maskName); + vc = new VariantContextBuilder(vc).filters(filters).make(); + } + + return vc; + } + /** * Helper function to check if a mask covers the variant location. * @@ -364,141 +569,14 @@ public class VariantFiltration extends RodWalker { * @return true if the genome location is within the extended mask area, false otherwise */ protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) { - boolean logic = genomeLoc != null && // have a location - genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig + final boolean needToCheckOveralpping = genomeLoc != null && // have a location + genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig (vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied - if ( logic ) { - if (vcBeforeLoc) - return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) - else - return vc.getStart() - genomeLoc.getStop() <= maskExtension; + if ( needToCheckOveralpping ) { + return vcBeforeLoc ? (genomeLoc.getStart() - vc.getEnd() <= maskExtension) // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) + : (vc.getStart() - genomeLoc.getStop() <= maskExtension); } else { return false; } } - - /** - * Add mask to variant context filters if it covers the it's location - * - * @param vc VariantContext - * @param genomeLoc genome location - * @param maskName name of the mask - * @param maskExtension bases beyond the mask - * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite. - * @return VariantContext with the mask added if the VariantContext is within the extended mask area - */ - private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) { - if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) { - Set filters = new LinkedHashSet(vc.getFilters()); - filters.add(maskName); - vc = new VariantContextBuilder(vc).filters(filters).make(); - } - - return vc; - } - - private void filter() { - // get the current context - FiltrationContext context = variantContextWindow.getContext(); - if ( context == null ) - return; - - final VariantContext vc = context.getVariantContext(); - final VariantContextBuilder builder = new VariantContextBuilder(vc); - - // make new Genotypes based on filters - if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) { - GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); - - // - // recompute AC, AN and AF if filtered genotypes are set to no-call - // - // occurrences of alternate alleles over all genotypes - final Map calledAltAlleles = new LinkedHashMap(vc.getNAlleles()-1); - for ( final Allele altAllele : vc.getAlternateAlleles() ) { - calledAltAlleles.put(altAllele, 0); - } - - int calledAlleles = 0; - boolean haveFilteredNoCallAlleles = false; - - // for each genotype, check filters then create a new object - for ( final Genotype g : vc.getGenotypes() ) { - if ( g.isCalled() ) { - final List filters = new ArrayList(); - if ( g.isFiltered() ) filters.add(g.getFilters()); - - // Add if expression filters the variant context - for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { - if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) ) - filters.add(exp.name); - } - - // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call - if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) { - haveFilteredNoCallAlleles = true; - genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make()); - } - else { - genotypes.add(new GenotypeBuilder(g).filters(filters).make()); - calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g); - } - } else { - genotypes.add(g); - } - } - - builder.genotypes(genotypes); - // if filtered genotypes are set to no-call, output recomputed AC, AN, AF - if ( haveFilteredNoCallAlleles ) - GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder); - } - - // make a new variant context based on filters - Set filters = new LinkedHashSet(vc.getFilters()); - - // test for clustered SNPs if requested - if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) - filters.add(CLUSTERED_SNP_FILTER_NAME); - - for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) { - try { - if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) ) - filters.add(exp.name); - } catch (Exception e) { - // do nothing unless specifically asked to; it just means that the expression isn't defined for this context - if ( failMissingValues ) - filters.add(exp.name); - } - } - - if ( filters.isEmpty() ) - builder.passFilters(); - else - builder.filters(filters); - - writer.add(builder.make()); - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. - */ - public void onTraversalDone(Integer result) { - // move the window over so that we can filter the last few variants - if ( windowInitializer != null ) { - while ( windowInitializer.size() < WINDOW_SIZE ) - windowInitializer.add(null); - variantContextWindow = new FiltrationContextWindow(windowInitializer); - } - for (int i=0; i < WINDOW_SIZE; i++) { - variantContextWindow.moveWindow(null); - filter(); - } - } } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java index 3ec696592..5262345fc 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java @@ -26,21 +26,24 @@ package org.broadinstitute.gatk.tools.walkers.filters; import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.Utils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import org.testng.Assert; -import org.testng.annotations.*; +import java.util.Set; +import java.util.stream.Collectors; public class VariantFiltrationUnitTest extends BaseTest { @@ -58,7 +61,7 @@ public class VariantFiltrationUnitTest extends BaseTest { } @DataProvider(name = "VariantMaskData") - public Object[][] DoesMaskCoverVariantTestData() { + public Object[][] doesMaskCoverVariantTestData() { final String maskName = "testMask"; @@ -85,8 +88,8 @@ public class VariantFiltrationUnitTest extends BaseTest { * @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true. * @param expectedValue return the expected return value from doesMaskCoverVariant() */ - @Test(dataProvider = "VariantMaskData") - public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension, + @Test(dataProvider = "doesMaskCoverVariantTestData") + public void testDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension, final boolean vcBeforeLoc, final boolean expectedValue) { // Build VariantContext @@ -104,4 +107,126 @@ public class VariantFiltrationUnitTest extends BaseTest { boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc); Assert.assertEquals(coversVariant, expectedValue); } + + @Test + public void testApplyGenotypeFilters(){ + + final VariantContext vc = buildDataForFilters().make(); + + final String filterName = "LowDP"; + final String filterExpr = "DP < 10"; + + final List genotypeFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + + final VariantContextBuilder anotherVCBuilder = VariantFiltration.applyGenotypeFilters(vc, genotypeFilterExps, false, false, false); + final VariantContext anotherVC = anotherVCBuilder.filters().make(); + + Assert.assertEquals(anotherVC.getGenotype("one").isFiltered(), true); + Assert.assertTrue(anotherVC.getGenotype("one").getFilters().equals(filterName)); + + Assert.assertEquals(anotherVC.getGenotype("two").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("three").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("four").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("five").isFiltered(), false); + + Assert.assertEquals(anotherVC.getGenotype("six").isFiltered(), false); + + final VariantContextBuilder yetAnotherVCBuilder = VariantFiltration.applyGenotypeFilters(anotherVC, genotypeFilterExps, false, true, false); + final VariantContext yetAnotherVC = yetAnotherVCBuilder.filters().make(); + Assert.assertEquals(yetAnotherVC.getGenotype("six").isFiltered(), true); + Assert.assertTrue(yetAnotherVC.getGenotype("six").getFilters().equals(filterName)); + } + + @Test + public void testApplyVCFilters(){ + + final VariantContext vcNoFilters = buildDataForFilters().make(); // assumes this vc doesn't hold any filters yet + + String filterName = "LowDP"; + String filterExpr = "DP < 23"; + List vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + + final Set filters = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false); + Assert.assertFalse(vcNoFilters.isFiltered()); + Assert.assertEquals(filters.size(), 1); + Assert.assertTrue(filters.contains(filterName)); + + filterName = "ID"; + filterExpr = "ID = rs123"; + vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr)); + Set filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, true); +// Assert.assertEquals(filterWhenFailMissing.size(), 1); +// Assert.assertTrue(filterWhenFailMissing.contains(filterName)); + filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false); + Assert.assertTrue(filterWhenFailMissing.isEmpty()); + } + + + private static VariantContextBuilder buildDataForFilters() { + /** + * Uses (part of) the following (semi fake) data for testing (data was modified from real data so expect some minor inconsistencies in annotations) + * 1 1234567 . T C 152.03 . + * AC=6;AF=1.00;AN=6;DP=22;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;SOR=0.693;set=variant3-variant4-variant6 + * GT:AD:DP:GQ:PGT:PID:PL:RGQ + * 1/1:0,2:9:6:1|1:15870493_CT_C:90,6,0 + * 1/1:0,4:10:12:1|1:15870493_CT_C:180,12,0 + * ./.:0:3:.:.:.:.:0 + * ./.:0:0:.:.:.:.:0 + * ./.:0:0:.:.:.:.:0 + * 1/1:0,0:.:6:1|1:15870493_CT_C:90,6,0 + */ + + final Allele refT = Allele.create("T", true); + final Allele altC = Allele.create("C", false); + final Allele nocall = Allele.NO_CALL; + + final VariantContextBuilder vcBuilder = new VariantContextBuilder("", "1", 1234567, 1234567, Arrays.asList(refT, altC)); + + vcBuilder.noID(); + vcBuilder.attribute("AC", 6); + vcBuilder.attribute("AF", 1.00); + vcBuilder.attribute("AN", 6); + vcBuilder.attribute("DP", 22); + vcBuilder.attribute("ExcessHet", 3.0103); + vcBuilder.attribute("FS", 0.000); + vcBuilder.attribute("MLEAC", 2); + vcBuilder.attribute("MLEAF", 1.00); + vcBuilder.attribute("SOR", 0.693); + + GenotypeBuilder gtBuilder = new GenotypeBuilder("one", Arrays.asList(altC,altC)); + final Genotype firstSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(9) // edge case not passing "DP < 10" + .make(); + + gtBuilder = new GenotypeBuilder("two", Arrays.asList(altC,altC)); + final Genotype secondSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(10) // edge case passing "DP < 10" + .make(); + + gtBuilder = new GenotypeBuilder("three", Arrays.asList(nocall,nocall)); + final Genotype thirdSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(3) + .make(); + + gtBuilder = new GenotypeBuilder("four", Arrays.asList(nocall,nocall)); + final Genotype fourthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(0) + .make(); + + gtBuilder = new GenotypeBuilder("five", Arrays.asList(nocall,nocall)); + final Genotype fifthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL) + .DP(0) + .make(); + + gtBuilder = new GenotypeBuilder("six", Arrays.asList(altC,altC)); + final Genotype sixthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR) + .DP(-1) + .make(); + + vcBuilder.genotypes(firstSample, secondSample, thirdSample, fourthSample, fifthSample, sixthSample); + return vcBuilder; + } }