fix 1430 for genotype filters; refactored filter() method; added unit and integration test; more comprehensive fix must be done first on htsjdk side in JEXLMap (#1456)
This commit is contained in:
parent
d582ee6d3c
commit
3c88e6859f
|
|
@ -211,6 +211,24 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
||||||
executeTest("testFilteringDPfromFORMAT", spec);
|
executeTest("testFilteringDPfromFORMAT", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilteringDPfromFORMATWithMissing() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
|
||||||
|
+ " --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
|
||||||
|
Arrays.asList("4bf46103a71bac92a11eae04b97f9877"));
|
||||||
|
executeTest("testFilteringDPfromFORMATWithMissing", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilteringDPfromFORMATAndFailMissing() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
|
||||||
|
+ " --missingValuesInExpressionsShouldEvaluateAsFailing --genotypeFilterExpression 'DP < 10' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormatWithMissing.vcf", 1,
|
||||||
|
Arrays.asList("baeda696c92adc8745ac4ebbdead6c91"));
|
||||||
|
executeTest("testFilteringDPfromFORMATAndFailMissing", spec);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInvertGenotypeFilterExpression() {
|
public void testInvertGenotypeFilterExpression() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.gatk.tools.walkers.filters;
|
package org.broadinstitute.gatk.tools.walkers.filters;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import htsjdk.tribble.Feature;
|
import htsjdk.tribble.Feature;
|
||||||
import org.broadinstitute.gatk.utils.Utils;
|
import org.broadinstitute.gatk.utils.Utils;
|
||||||
import org.broadinstitute.gatk.utils.commandline.*;
|
import org.broadinstitute.gatk.utils.commandline.*;
|
||||||
|
|
@ -55,7 +56,7 @@ import java.util.*;
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered
|
* This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered
|
||||||
* by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
|
* by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
|
||||||
* in the output unless their removal is requested in the command line. </p>
|
* in the output unless their removal is requested in the command line. </p>
|
||||||
*
|
*
|
||||||
* <p>The most common way of specifying filtering criteria is by using JEXL queries. See the
|
* <p>The most common way of specifying filtering criteria is by using JEXL queries. See the
|
||||||
|
|
@ -84,7 +85,7 @@ import java.util.*;
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* <h3>Caveat</h3>
|
* <h3>Caveat</h3>
|
||||||
* <p>when you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied
|
* <p>when you run {@link VariantFiltration} with a command that includes multiple logical parts, each part of the command is applied
|
||||||
* individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies
|
* individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies
|
||||||
* some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a
|
* some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a
|
||||||
* genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might
|
* genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might
|
||||||
|
|
@ -98,14 +99,17 @@ import java.util.*;
|
||||||
@Reference(window=@Window(start=-50,stop=50))
|
@Reference(window=@Window(start=-50,stop=50))
|
||||||
public class VariantFiltration extends RodWalker<Integer, Integer> {
|
public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
// Arguments
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed,
|
* Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed,
|
||||||
* i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used.
|
* i.e. filter variants that do not overlap with provided mask, then argument {@code -filterNotInMask} can be used.
|
||||||
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
||||||
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
|
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
|
||||||
*/
|
*/
|
||||||
@Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false)
|
@Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false)
|
||||||
public RodBinding<Feature> mask;
|
public RodBinding<Feature> mask;
|
||||||
|
|
@ -115,41 +119,41 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
|
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
|
||||||
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
|
* {@code --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"}).
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
|
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
|
||||||
protected ArrayList<String> filterExpressions = new ArrayList<String>();
|
protected ArrayList<String> filterExpressions = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
|
* This name is put in the <pre>FILTER</pre> field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
|
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
|
||||||
protected ArrayList<String> filterNames = new ArrayList<String>();
|
protected ArrayList<String> filterNames = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
* Similar to the <pre>INFO</pre> field based expressions, but used on the <pre>FORMAT</pre> (genotype) fields instead.
|
||||||
* VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
|
* {@link VariantFiltration} will add the sample-level <pre>FT</pre> tag to the <pre>FORMAT</pre> field of filtered samples (this does not affect the record's <pre>FILTER</pre> tag).
|
||||||
* One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
|
* One can filter normally based on most fields (e.g. {@code "GQ < 5.0"}), but the <pre>GT</pre> (genotype) field is an exception.
|
||||||
* methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). Also available are
|
* We have put in convenience methods so that one can now filter out hets ({@code "isHet == 1"}), refs ({@code "isHomRef == 1"}), or homs ({@code "isHomVar == 1"}).
|
||||||
* expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object.
|
* Also available are expressions {@code isCalled}, {@code isNoCall}, {@code isMixed}, and {@code isAvailable}, in accordance with the methods of the {@link Genotype} object.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
|
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
|
||||||
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<String>();
|
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
* Similar to the <pre>INFO</pre> field based expressions, but used on the <pre>FORMAT</pre> (genotype) fields instead.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
|
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
|
||||||
protected ArrayList<String> genotypeFilterNames = new ArrayList<String>();
|
protected ArrayList<String> genotypeFilterNames = new ArrayList<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Works together with the --clusterWindowSize argument.
|
* Works together with the {@code --clusterWindowSize} argument.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
|
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
|
||||||
protected Integer clusterSize = 3;
|
protected Integer clusterSize = 3;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
|
* Works together with the {@code --clusterWindowSize} argument. To disable the clustered SNP filter, set this value to less than 1.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
|
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
|
||||||
protected Integer clusterWindow = 0;
|
protected Integer clusterWindow = 0;
|
||||||
|
|
@ -158,45 +162,45 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
protected Integer maskExtension = 0;
|
protected Integer maskExtension = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* When using the -mask argument, the maskName will be annotated in the variant record.
|
* When using the {@code -mask} argument, the {@code maskName} will be annotated in the variant record.
|
||||||
* Note that when using the -filterNotInMask argument to reverse the masking logic,
|
* Note that when using the {@code -filterNotInMask} argument to reverse the masking logic,
|
||||||
* it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
* it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
||||||
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
|
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
|
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
|
||||||
protected String maskName = "Mask";
|
protected String maskName = "Mask";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default, if the -mask argument is used, any variant falling in a mask will be filtered.
|
* By default, if the {@code -mask} argument is used, any variant falling in a mask will be filtered.
|
||||||
* If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered.
|
* If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered.
|
||||||
* Use case is, for example, if we have an interval list or BED file with "good" sites.
|
* Use case is, for example, if we have an interval list or BED file with "good" sites.
|
||||||
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
* Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used
|
||||||
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
|
* (e.g. if masking against Hapmap, use {@code -maskName=hapmap} for the normal masking and {@code -maskName=not_hapmap} for the reverse masking).
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false)
|
@Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false)
|
||||||
protected boolean filterRecordsNotInMask = false;
|
protected boolean filterRecordsNotInMask = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
|
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as <pre>PASS</pre>ing.
|
||||||
* Use this argument to have it evaluate as failing filters instead for these cases.
|
* Use this argument to have it evaluate as failing filters instead for these cases.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
|
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
|
||||||
protected Boolean failMissingValues = false;
|
protected Boolean failMissingValues = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invalidate previous filters applied to the VariantContext, applying only the filters here
|
* Invalidate previous filters applied to the {@link VariantContext}, applying only the filters here.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
|
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
|
||||||
boolean invalidatePrevious = false;
|
boolean invalidatePrevious = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invert the selection criteria for --filterExpression
|
* Invert the selection criteria for {@code --filterExpression}.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
|
@Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
|
||||||
protected boolean invertFilterExpression = false;
|
protected boolean invertFilterExpression = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invert the selection criteria for --genotypeFilterExpression
|
* Invert the selection criteria for {@code --genotypeFilterExpression}.
|
||||||
*/
|
*/
|
||||||
@Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
|
@Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
|
||||||
protected boolean invertGenotypeFilterExpression = false;
|
protected boolean invertGenotypeFilterExpression = false;
|
||||||
|
|
@ -207,81 +211,42 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
|
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
|
||||||
private boolean setFilteredGenotypesToNocall = false;
|
private boolean setFilteredGenotypesToNocall = false;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
// Fields
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
// JEXL expressions for the filters
|
// JEXL expressions for the filters
|
||||||
List<VariantContextUtils.JexlVCMatchExp> filterExps;
|
List<VariantContextUtils.JexlVCMatchExp> filterExps;
|
||||||
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
|
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
|
||||||
|
|
||||||
public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster";
|
public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster";
|
||||||
|
|
||||||
private ClusteredSnps clusteredSNPs = null;
|
private ClusteredSnps clusteredSNPs = null;
|
||||||
private GenomeLoc previousMaskPosition = null;
|
private GenomeLoc previousMaskPosition = null;
|
||||||
|
private FiltrationContextWindow variantContextWindow; // the structures necessary to initialize and maintain a windowed context
|
||||||
// the structures necessary to initialize and maintain a windowed context
|
|
||||||
private FiltrationContextWindow variantContextWindow;
|
|
||||||
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
|
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
|
||||||
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
|
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<>();
|
||||||
|
|
||||||
private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
private static final List<Allele> DIPLOID_NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||||
|
|
||||||
/**
|
// -----------------------------------------------------------------------------------------------
|
||||||
* Prepend inverse phrase to description if --invertFilterExpression
|
// public methods from base classes
|
||||||
*
|
// -----------------------------------------------------------------------------------------------
|
||||||
* @param description the description
|
|
||||||
* @return the description with inverse prepended if --invert_filter_expression
|
|
||||||
*/
|
|
||||||
private String possiblyInvertFilterExpression( String description ){
|
|
||||||
if ( invertFilterExpression )
|
|
||||||
description = "Inverse of: " + description;
|
|
||||||
return description;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeVcfWriter() {
|
|
||||||
|
|
||||||
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
|
|
||||||
|
|
||||||
// setup the header fields
|
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
|
||||||
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
|
|
||||||
|
|
||||||
// need AC, AN and AF since output if set filtered genotypes to no-call
|
|
||||||
if ( setFilteredGenotypesToNocall ) {
|
|
||||||
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
|
|
||||||
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
|
|
||||||
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( clusterWindow > 0 )
|
|
||||||
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
|
|
||||||
|
|
||||||
if ( !genotypeFilterExps.isEmpty() )
|
|
||||||
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
|
||||||
|
|
||||||
try {
|
|
||||||
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
|
|
||||||
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
|
||||||
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
|
|
||||||
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
|
||||||
|
|
||||||
if ( mask.isBound() ) {
|
|
||||||
if (filterRecordsNotInMask)
|
|
||||||
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
|
|
||||||
else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
|
|
||||||
}
|
|
||||||
} catch (IllegalArgumentException e) {
|
|
||||||
throw new UserException.BadInput(e.getMessage());
|
|
||||||
}
|
|
||||||
|
|
||||||
writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
|
|
||||||
}
|
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if ( clusterWindow > 0 )
|
|
||||||
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow);
|
|
||||||
|
|
||||||
if ( maskExtension < 0 )
|
if ( maskExtension < 0 ) {
|
||||||
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
|
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filterRecordsNotInMask && !mask.isBound()) {
|
||||||
|
throw new UserException.BadArgumentValue("filterNotInMask", "argument not allowed if mask argument is not provided");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( clusterWindow > 0 ) {
|
||||||
|
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(), clusterSize, clusterWindow);
|
||||||
|
}
|
||||||
|
|
||||||
if (filterRecordsNotInMask && !mask.isBound())
|
|
||||||
throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
|
|
||||||
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
|
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
|
||||||
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
|
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
|
||||||
|
|
||||||
|
|
@ -290,44 +255,45 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
initializeVcfWriter();
|
initializeVcfWriter();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() { return 0; }
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param tracker the meta-data tracker
|
* @param tracker the meta-data tracker
|
||||||
* @param ref the reference base
|
* @param ref the reference base
|
||||||
* @param context the context for the given locus
|
* @param context the context for the given locus
|
||||||
* @return 1 if the locus was successfully processed, 0 if otherwise
|
* @return 1 if the locus was successfully processed, 0 otherwise
|
||||||
*/
|
*/
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
if ( tracker == null )
|
if ( tracker == null ) {
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
|
final Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
|
||||||
|
|
||||||
// is there a SNP mask present?
|
// is there a SNP mask present?
|
||||||
boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
|
final boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask));
|
||||||
if ( hasMask )
|
if ( hasMask ) {
|
||||||
previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask
|
previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask
|
||||||
|
}
|
||||||
|
|
||||||
for ( VariantContext vc : VCs ) {
|
for ( VariantContext vc : VCs ) {
|
||||||
|
|
||||||
if ( invalidatePrevious ) {
|
if ( invalidatePrevious ) {
|
||||||
vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
|
vc = (new VariantContextBuilder(vc)).filters(new HashSet<>()).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
// filter based on previous mask position
|
// filter based on previous mask position
|
||||||
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
|
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
|
||||||
|
|
||||||
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
final FiltrationContext varContext = new FiltrationContext(ref, vc);
|
||||||
|
|
||||||
// if we're still initializing the context, do so
|
// if we're still initializing the context, do so
|
||||||
if ( windowInitializer != null ) {
|
if ( windowInitializer != null ) {
|
||||||
|
|
||||||
// if this is a mask position, filter previous records
|
// if this is a mask position, filter previous records
|
||||||
if ( hasMask ) {
|
if ( hasMask ) {
|
||||||
for ( FiltrationContext prevVC : windowInitializer )
|
for ( final FiltrationContext prevVC : windowInitializer ) {
|
||||||
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
windowInitializer.add(varContext);
|
windowInitializer.add(varContext);
|
||||||
|
|
@ -339,9 +305,10 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
// if this is a mask position, filter previous records
|
// if this is a mask position, filter previous records
|
||||||
if ( hasMask ) {
|
if ( hasMask ) {
|
||||||
for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
|
for ( final FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
|
||||||
if ( prevVC != null )
|
if ( prevVC != null ) {
|
||||||
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -353,6 +320,244 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return sum + value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tell the user the number of loci processed and close out the new variants file.
|
||||||
|
*
|
||||||
|
* @param result the number of loci seen.
|
||||||
|
*/
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
// move the window over so that we can filter the last few variants
|
||||||
|
if ( windowInitializer != null ) {
|
||||||
|
while ( windowInitializer.size() < WINDOW_SIZE ) {
|
||||||
|
windowInitializer.add(null);
|
||||||
|
}
|
||||||
|
variantContextWindow = new FiltrationContextWindow(windowInitializer);
|
||||||
|
}
|
||||||
|
for (int i=0; i < WINDOW_SIZE; i++) {
|
||||||
|
variantContextWindow.moveWindow(null);
|
||||||
|
filter();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
// main filtering steps
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Organizing filters: genotype filters and normal filters.
|
||||||
|
*/
|
||||||
|
private void filter() {
|
||||||
|
// get the current context
|
||||||
|
final FiltrationContext context = variantContextWindow.getContext();
|
||||||
|
if ( context == null ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final VariantContext vc = context.getVariantContext();
|
||||||
|
|
||||||
|
// make new Genotypes based on genotype filters
|
||||||
|
final VariantContextBuilder builder = ( genotypeFilterExps.isEmpty() && !setFilteredGenotypesToNocall ) ? new VariantContextBuilder(vc)
|
||||||
|
: applyGenotypeFilters(vc, genotypeFilterExps, invertGenotypeFilterExpression, failMissingValues, setFilteredGenotypesToNocall);
|
||||||
|
|
||||||
|
// extract filters already in VC and append new filters
|
||||||
|
final Set<String> filters = buildVCfilters(vc, filterExps, invertFilterExpression, failMissingValues);
|
||||||
|
// test for clustered SNPs if requested
|
||||||
|
if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) ) {
|
||||||
|
filters.add(CLUSTERED_SNP_FILTER_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
// make a new variant context based on all filters, and write
|
||||||
|
writer.add(filters.isEmpty() ? builder.passFilters().make() : builder.filters(filters).make());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a VC builder and a vc (which was used to construct the builder), update the properties that the builder
|
||||||
|
* will use to construct a new VC, based on some of the attributes/annotations of the old VC.
|
||||||
|
* @param vc variant context holding genotypes to be filtered
|
||||||
|
* @param genotypeFilterExpressions genotype filter expressions
|
||||||
|
* @param invertGenotypeFilterExpression should invert the genotype filter expression or not
|
||||||
|
* @param failIfMissingValues if sample misses the corresponding annotation(s) the filter(s) work by, should we fail them or not
|
||||||
|
* @param setFilteredGenotypesToNocall if sample is filtered should we set genotype to non-call or not
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
static VariantContextBuilder applyGenotypeFilters(final VariantContext vc,
|
||||||
|
final List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExpressions,
|
||||||
|
final boolean invertGenotypeFilterExpression,
|
||||||
|
final boolean failIfMissingValues,
|
||||||
|
final boolean setFilteredGenotypesToNocall) {
|
||||||
|
|
||||||
|
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||||
|
|
||||||
|
final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
||||||
|
|
||||||
|
// recompute AC, AN and AF if filtered genotypes are set to no-call
|
||||||
|
// occurrences of alternate alleles over all genotypes
|
||||||
|
final Map<Allele, Integer> calledAltAlleles = new LinkedHashMap<>(vc.getNAlleles()-1);
|
||||||
|
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
|
||||||
|
calledAltAlleles.put(altAllele, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
int calledAlleles = 0;
|
||||||
|
boolean haveFilteredNoCallAlleles = false;
|
||||||
|
|
||||||
|
// for each genotype, check filters then create a new object
|
||||||
|
for ( final Genotype g : vc.getGenotypes() ) {
|
||||||
|
if ( g.isCalled() ) {
|
||||||
|
final List<String> filters = new ArrayList<>();
|
||||||
|
if ( g.isFiltered() ) filters.add(g.getFilters());
|
||||||
|
|
||||||
|
// Add if expression filters the variant context
|
||||||
|
for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExpressions ) {
|
||||||
|
try {
|
||||||
|
if (Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression)) {
|
||||||
|
filters.add(exp.name);
|
||||||
|
}
|
||||||
|
} catch (final IllegalArgumentException e) {
|
||||||
|
// logic: right now (2016/08/18) if a filter is applied based on specific annotation and some sample contains missing value for such annotation,
|
||||||
|
// lower level code will throw IllegalArgumentException, therefore we specifically catch this type of exception
|
||||||
|
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
|
||||||
|
if ( failIfMissingValues ) {
|
||||||
|
filters.add(exp.name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
|
||||||
|
if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
|
||||||
|
haveFilteredNoCallAlleles = true;
|
||||||
|
genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(DIPLOID_NO_CALL_ALLELES).make());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
|
||||||
|
calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
genotypes.add(g);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.genotypes(genotypes);
|
||||||
|
// if filtered genotypes are set to no-call, output recomputed AC, AN, AF
|
||||||
|
if ( haveFilteredNoCallAlleles ) {
|
||||||
|
GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract filters already present in the {@code vc}, and append user provided expressions.
|
||||||
|
* For user provided genotype filter expressions, see {@link #applyGenotypeFilters(VariantContext, List, boolean, boolean, boolean)}
|
||||||
|
* @param vc VC from which filters to be extracted
|
||||||
|
* @param vcFilterExpressions more filter expressions provided by user
|
||||||
|
* @param invertVCfilterExpression should we invert the logic in expressions provided in {@code vcFilterExpressions}
|
||||||
|
* @param failIfMissingValues should we mark the VC as failing if it misses the value the filters work on
|
||||||
|
*
|
||||||
|
* @return filters already in the provided vc and user-provided filters
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
static Set<String> buildVCfilters(final VariantContext vc,
|
||||||
|
final List<VariantContextUtils.JexlVCMatchExp> vcFilterExpressions,
|
||||||
|
final boolean invertVCfilterExpression,
|
||||||
|
final boolean failIfMissingValues) {
|
||||||
|
|
||||||
|
final Set<String> filters = new LinkedHashSet<>(vc.getFilters());
|
||||||
|
|
||||||
|
for ( final VariantContextUtils.JexlVCMatchExp exp : vcFilterExpressions ) {
|
||||||
|
try {
|
||||||
|
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertVCfilterExpression) ) {
|
||||||
|
filters.add(exp.name);
|
||||||
|
}
|
||||||
|
} catch (final Exception e) {
|
||||||
|
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
|
||||||
|
if ( failIfMissingValues ) {
|
||||||
|
filters.add(exp.name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return filters;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
// some other complications besides main stuff
|
||||||
|
// -----------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private void initializeVcfWriter() {
|
||||||
|
|
||||||
|
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
|
||||||
|
|
||||||
|
// setup the header fields
|
||||||
|
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||||
|
|
||||||
|
// need AC, AN and AF since output if set filtered genotypes to no-call
|
||||||
|
if ( setFilteredGenotypesToNocall ) {
|
||||||
|
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
|
||||||
|
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
|
||||||
|
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( clusterWindow > 0 ) {
|
||||||
|
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !genotypeFilterExps.isEmpty() ) {
|
||||||
|
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
for ( final VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
|
||||||
|
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
||||||
|
}
|
||||||
|
for ( final VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
|
||||||
|
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( mask.isBound() ) {
|
||||||
|
hInfo.add(new VCFFilterHeaderLine(maskName, filterRecordsNotInMask ? "Doesn't overlap a user-input mask" : "Overlaps a user-input mask"));
|
||||||
|
}
|
||||||
|
} catch (final IllegalArgumentException e) {
|
||||||
|
throw new UserException.BadInput(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepend inverse phrase to description if {@code --invertFilterExpression}.
|
||||||
|
*
|
||||||
|
* @param description the description
|
||||||
|
* @return the description with inverse prepended if {@code --invert_filter_expression}.
|
||||||
|
*/
|
||||||
|
private String possiblyInvertFilterExpression( final String description ){
|
||||||
|
return invertFilterExpression ? "Inverse of: " + description : description;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add mask to variant context filters if it covers its location.
|
||||||
|
* @param vc VariantContext
|
||||||
|
* @param genomeLoc genome location
|
||||||
|
* @param maskName name of the mask
|
||||||
|
* @param maskExtension bases beyond the mask
|
||||||
|
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
|
||||||
|
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
|
||||||
|
*/
|
||||||
|
private VariantContext addMaskIfCoversVariant(VariantContext vc, final GenomeLoc genomeLoc, final String maskName, final int maskExtension, final boolean locStart) {
|
||||||
|
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
|
||||||
|
final Set<String> filters = new LinkedHashSet<>(vc.getFilters());
|
||||||
|
filters.add(maskName);
|
||||||
|
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||||
|
}
|
||||||
|
|
||||||
|
return vc;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper function to check if a mask covers the variant location.
|
* Helper function to check if a mask covers the variant location.
|
||||||
*
|
*
|
||||||
|
|
@ -364,141 +569,14 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
||||||
* @return true if the genome location is within the extended mask area, false otherwise
|
* @return true if the genome location is within the extended mask area, false otherwise
|
||||||
*/
|
*/
|
||||||
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
|
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
|
||||||
boolean logic = genomeLoc != null && // have a location
|
final boolean needToCheckOveralpping = genomeLoc != null && // have a location
|
||||||
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
|
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
|
||||||
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
|
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
|
||||||
if ( logic ) {
|
if ( needToCheckOveralpping ) {
|
||||||
if (vcBeforeLoc)
|
return vcBeforeLoc ? (genomeLoc.getStart() - vc.getEnd() <= maskExtension) // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
|
||||||
return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
|
: (vc.getStart() - genomeLoc.getStop() <= maskExtension);
|
||||||
else
|
|
||||||
return vc.getStart() - genomeLoc.getStop() <= maskExtension;
|
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Add mask to variant context filters if it covers the it's location
|
|
||||||
*
|
|
||||||
* @param vc VariantContext
|
|
||||||
* @param genomeLoc genome location
|
|
||||||
* @param maskName name of the mask
|
|
||||||
* @param maskExtension bases beyond the mask
|
|
||||||
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
|
|
||||||
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
|
|
||||||
*/
|
|
||||||
private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) {
|
|
||||||
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
|
|
||||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
|
||||||
filters.add(maskName);
|
|
||||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
|
||||||
}
|
|
||||||
|
|
||||||
return vc;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void filter() {
|
|
||||||
// get the current context
|
|
||||||
FiltrationContext context = variantContextWindow.getContext();
|
|
||||||
if ( context == null )
|
|
||||||
return;
|
|
||||||
|
|
||||||
final VariantContext vc = context.getVariantContext();
|
|
||||||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
|
||||||
|
|
||||||
// make new Genotypes based on filters
|
|
||||||
if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) {
|
|
||||||
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
|
||||||
|
|
||||||
//
|
|
||||||
// recompute AC, AN and AF if filtered genotypes are set to no-call
|
|
||||||
//
|
|
||||||
// occurrences of alternate alleles over all genotypes
|
|
||||||
final Map<Allele, Integer> calledAltAlleles = new LinkedHashMap<Allele, Integer>(vc.getNAlleles()-1);
|
|
||||||
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
|
|
||||||
calledAltAlleles.put(altAllele, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
int calledAlleles = 0;
|
|
||||||
boolean haveFilteredNoCallAlleles = false;
|
|
||||||
|
|
||||||
// for each genotype, check filters then create a new object
|
|
||||||
for ( final Genotype g : vc.getGenotypes() ) {
|
|
||||||
if ( g.isCalled() ) {
|
|
||||||
final List<String> filters = new ArrayList<String>();
|
|
||||||
if ( g.isFiltered() ) filters.add(g.getFilters());
|
|
||||||
|
|
||||||
// Add if expression filters the variant context
|
|
||||||
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
|
|
||||||
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
|
|
||||||
filters.add(exp.name);
|
|
||||||
}
|
|
||||||
|
|
||||||
// if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
|
|
||||||
if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) {
|
|
||||||
haveFilteredNoCallAlleles = true;
|
|
||||||
genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make());
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
|
|
||||||
calledAlleles = GATKVariantContextUtils.incrementChromosomeCountsInfo(calledAltAlleles, calledAlleles, g);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
genotypes.add(g);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.genotypes(genotypes);
|
|
||||||
// if filtered genotypes are set to no-call, output recomputed AC, AN, AF
|
|
||||||
if ( haveFilteredNoCallAlleles )
|
|
||||||
GATKVariantContextUtils.updateChromosomeCountsInfo(calledAltAlleles, calledAlleles, builder);
|
|
||||||
}
|
|
||||||
|
|
||||||
// make a new variant context based on filters
|
|
||||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
|
||||||
|
|
||||||
// test for clustered SNPs if requested
|
|
||||||
if ( clusteredSNPs != null && clusteredSNPs.filter(variantContextWindow) )
|
|
||||||
filters.add(CLUSTERED_SNP_FILTER_NAME);
|
|
||||||
|
|
||||||
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
|
|
||||||
try {
|
|
||||||
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) )
|
|
||||||
filters.add(exp.name);
|
|
||||||
} catch (Exception e) {
|
|
||||||
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
|
|
||||||
if ( failMissingValues )
|
|
||||||
filters.add(exp.name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( filters.isEmpty() )
|
|
||||||
builder.passFilters();
|
|
||||||
else
|
|
||||||
builder.filters(filters);
|
|
||||||
|
|
||||||
writer.add(builder.make());
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
|
||||||
return sum + value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tell the user the number of loci processed and close out the new variants file.
|
|
||||||
*
|
|
||||||
* @param result the number of loci seen.
|
|
||||||
*/
|
|
||||||
public void onTraversalDone(Integer result) {
|
|
||||||
// move the window over so that we can filter the last few variants
|
|
||||||
if ( windowInitializer != null ) {
|
|
||||||
while ( windowInitializer.size() < WINDOW_SIZE )
|
|
||||||
windowInitializer.add(null);
|
|
||||||
variantContextWindow = new FiltrationContextWindow(windowInitializer);
|
|
||||||
}
|
|
||||||
for (int i=0; i < WINDOW_SIZE; i++) {
|
|
||||||
variantContextWindow.moveWindow(null);
|
|
||||||
filter();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,21 +26,24 @@
|
||||||
package org.broadinstitute.gatk.tools.walkers.filters;
|
package org.broadinstitute.gatk.tools.walkers.filters;
|
||||||
|
|
||||||
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
||||||
|
import htsjdk.variant.variantcontext.*;
|
||||||
|
import htsjdk.variant.vcf.VCFConstants;
|
||||||
import org.broadinstitute.gatk.utils.BaseTest;
|
import org.broadinstitute.gatk.utils.BaseTest;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||||
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.gatk.utils.Utils;
|
import org.broadinstitute.gatk.utils.Utils;
|
||||||
import htsjdk.variant.variantcontext.Allele;
|
|
||||||
import htsjdk.variant.variantcontext.VariantContext;
|
|
||||||
import htsjdk.variant.variantcontext.VariantContextBuilder;
|
|
||||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeTest;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.testng.Assert;
|
import java.util.Set;
|
||||||
import org.testng.annotations.*;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class VariantFiltrationUnitTest extends BaseTest {
|
public class VariantFiltrationUnitTest extends BaseTest {
|
||||||
|
|
||||||
|
|
@ -58,7 +61,7 @@ public class VariantFiltrationUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@DataProvider(name = "VariantMaskData")
|
@DataProvider(name = "VariantMaskData")
|
||||||
public Object[][] DoesMaskCoverVariantTestData() {
|
public Object[][] doesMaskCoverVariantTestData() {
|
||||||
|
|
||||||
final String maskName = "testMask";
|
final String maskName = "testMask";
|
||||||
|
|
||||||
|
|
@ -85,8 +88,8 @@ public class VariantFiltrationUnitTest extends BaseTest {
|
||||||
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
|
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
|
||||||
* @param expectedValue return the expected return value from doesMaskCoverVariant()
|
* @param expectedValue return the expected return value from doesMaskCoverVariant()
|
||||||
*/
|
*/
|
||||||
@Test(dataProvider = "VariantMaskData")
|
@Test(dataProvider = "doesMaskCoverVariantTestData")
|
||||||
public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
|
public void testDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
|
||||||
final boolean vcBeforeLoc, final boolean expectedValue) {
|
final boolean vcBeforeLoc, final boolean expectedValue) {
|
||||||
|
|
||||||
// Build VariantContext
|
// Build VariantContext
|
||||||
|
|
@ -104,4 +107,126 @@ public class VariantFiltrationUnitTest extends BaseTest {
|
||||||
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
|
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
|
||||||
Assert.assertEquals(coversVariant, expectedValue);
|
Assert.assertEquals(coversVariant, expectedValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testApplyGenotypeFilters(){
|
||||||
|
|
||||||
|
final VariantContext vc = buildDataForFilters().make();
|
||||||
|
|
||||||
|
final String filterName = "LowDP";
|
||||||
|
final String filterExpr = "DP < 10";
|
||||||
|
|
||||||
|
final List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
|
||||||
|
|
||||||
|
final VariantContextBuilder anotherVCBuilder = VariantFiltration.applyGenotypeFilters(vc, genotypeFilterExps, false, false, false);
|
||||||
|
final VariantContext anotherVC = anotherVCBuilder.filters().make();
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("one").isFiltered(), true);
|
||||||
|
Assert.assertTrue(anotherVC.getGenotype("one").getFilters().equals(filterName));
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("two").isFiltered(), false);
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("three").isFiltered(), false);
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("four").isFiltered(), false);
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("five").isFiltered(), false);
|
||||||
|
|
||||||
|
Assert.assertEquals(anotherVC.getGenotype("six").isFiltered(), false);
|
||||||
|
|
||||||
|
final VariantContextBuilder yetAnotherVCBuilder = VariantFiltration.applyGenotypeFilters(anotherVC, genotypeFilterExps, false, true, false);
|
||||||
|
final VariantContext yetAnotherVC = yetAnotherVCBuilder.filters().make();
|
||||||
|
Assert.assertEquals(yetAnotherVC.getGenotype("six").isFiltered(), true);
|
||||||
|
Assert.assertTrue(yetAnotherVC.getGenotype("six").getFilters().equals(filterName));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testApplyVCFilters(){
|
||||||
|
|
||||||
|
final VariantContext vcNoFilters = buildDataForFilters().make(); // assumes this vc doesn't hold any filters yet
|
||||||
|
|
||||||
|
String filterName = "LowDP";
|
||||||
|
String filterExpr = "DP < 23";
|
||||||
|
List<VariantContextUtils.JexlVCMatchExp> vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
|
||||||
|
|
||||||
|
final Set<String> filters = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
|
||||||
|
Assert.assertFalse(vcNoFilters.isFiltered());
|
||||||
|
Assert.assertEquals(filters.size(), 1);
|
||||||
|
Assert.assertTrue(filters.contains(filterName));
|
||||||
|
|
||||||
|
filterName = "ID";
|
||||||
|
filterExpr = "ID = rs123";
|
||||||
|
vcFilterExps = VariantContextUtils.initializeMatchExps(Arrays.asList(filterName), Arrays.asList(filterExpr));
|
||||||
|
Set<String> filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, true);
|
||||||
|
// Assert.assertEquals(filterWhenFailMissing.size(), 1);
|
||||||
|
// Assert.assertTrue(filterWhenFailMissing.contains(filterName));
|
||||||
|
filterWhenFailMissing = VariantFiltration.buildVCfilters(vcNoFilters, vcFilterExps, false, false);
|
||||||
|
Assert.assertTrue(filterWhenFailMissing.isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static VariantContextBuilder buildDataForFilters() {
|
||||||
|
/**
|
||||||
|
* Uses (part of) the following (semi fake) data for testing (data was modified from real data so expect some minor inconsistencies in annotations)
|
||||||
|
* 1 1234567 . T C 152.03 .
|
||||||
|
* AC=6;AF=1.00;AN=6;DP=22;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;SOR=0.693;set=variant3-variant4-variant6
|
||||||
|
* GT:AD:DP:GQ:PGT:PID:PL:RGQ
|
||||||
|
* 1/1:0,2:9:6:1|1:15870493_CT_C:90,6,0
|
||||||
|
* 1/1:0,4:10:12:1|1:15870493_CT_C:180,12,0
|
||||||
|
* ./.:0:3:.:.:.:.:0
|
||||||
|
* ./.:0:0:.:.:.:.:0
|
||||||
|
* ./.:0:0:.:.:.:.:0
|
||||||
|
* 1/1:0,0:.:6:1|1:15870493_CT_C:90,6,0
|
||||||
|
*/
|
||||||
|
|
||||||
|
final Allele refT = Allele.create("T", true);
|
||||||
|
final Allele altC = Allele.create("C", false);
|
||||||
|
final Allele nocall = Allele.NO_CALL;
|
||||||
|
|
||||||
|
final VariantContextBuilder vcBuilder = new VariantContextBuilder("", "1", 1234567, 1234567, Arrays.asList(refT, altC));
|
||||||
|
|
||||||
|
vcBuilder.noID();
|
||||||
|
vcBuilder.attribute("AC", 6);
|
||||||
|
vcBuilder.attribute("AF", 1.00);
|
||||||
|
vcBuilder.attribute("AN", 6);
|
||||||
|
vcBuilder.attribute("DP", 22);
|
||||||
|
vcBuilder.attribute("ExcessHet", 3.0103);
|
||||||
|
vcBuilder.attribute("FS", 0.000);
|
||||||
|
vcBuilder.attribute("MLEAC", 2);
|
||||||
|
vcBuilder.attribute("MLEAF", 1.00);
|
||||||
|
vcBuilder.attribute("SOR", 0.693);
|
||||||
|
|
||||||
|
GenotypeBuilder gtBuilder = new GenotypeBuilder("one", Arrays.asList(altC,altC));
|
||||||
|
final Genotype firstSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
|
||||||
|
.DP(9) // edge case not passing "DP < 10"
|
||||||
|
.make();
|
||||||
|
|
||||||
|
gtBuilder = new GenotypeBuilder("two", Arrays.asList(altC,altC));
|
||||||
|
final Genotype secondSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
|
||||||
|
.DP(10) // edge case passing "DP < 10"
|
||||||
|
.make();
|
||||||
|
|
||||||
|
gtBuilder = new GenotypeBuilder("three", Arrays.asList(nocall,nocall));
|
||||||
|
final Genotype thirdSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
|
||||||
|
.DP(3)
|
||||||
|
.make();
|
||||||
|
|
||||||
|
gtBuilder = new GenotypeBuilder("four", Arrays.asList(nocall,nocall));
|
||||||
|
final Genotype fourthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
|
||||||
|
.DP(0)
|
||||||
|
.make();
|
||||||
|
|
||||||
|
gtBuilder = new GenotypeBuilder("five", Arrays.asList(nocall,nocall));
|
||||||
|
final Genotype fifthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.NO_CALL)
|
||||||
|
.DP(0)
|
||||||
|
.make();
|
||||||
|
|
||||||
|
gtBuilder = new GenotypeBuilder("six", Arrays.asList(altC,altC));
|
||||||
|
final Genotype sixthSample = gtBuilder.attribute(VCFConstants.GENOTYPE_KEY, GenotypeType.HOM_VAR)
|
||||||
|
.DP(-1)
|
||||||
|
.make();
|
||||||
|
|
||||||
|
vcBuilder.genotypes(firstSample, secondSample, thirdSample, fourthSample, fifthSample, sixthSample);
|
||||||
|
return vcBuilder;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue