From a6ca97ef141443eb73abce4d2bca692669e6a8db Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 15 May 2015 10:15:42 -0400 Subject: [PATCH] Site-level selection based on genotype filter status --- .../VariantFiltrationIntegrationTest.java | 44 ++++- .../SelectVariantsIntegrationTest.java | 88 ++++++++- .../walkers/filters/VariantFiltration.java | 23 ++- .../walkers/variantutils/SelectVariants.java | 175 +++++++++++++++--- 4 files changed, 287 insertions(+), 43 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index 0816b00f2..6c139b2e9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -147,11 +147,19 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testInvertFilter() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1, + baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invertFilterExpression", 1, Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765")); executeTest("test inversion of selection of filter with separate names #2", spec); } + @Test + public void testInvertJexlFilter() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --filterName ABF -filter 'AlleleBalance >= 0.7' --filterName FSF -filter 'FisherStrand != 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + Arrays.asList("6fa6cd89bfc8b6b4dfc3da25eb36d08b")); // Differs from testInvertFilter() because their VCF header FILTER description uses the -filter argument. Their filter statuses are identical. + executeTest("test inversion of selection of filter via JEXL with separate names #2", spec); + } + @Test public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( @@ -207,8 +215,40 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testInvertGenotypeFilterExpression() { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference - + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1, + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invertGenotypeFilterExpression", 1, Arrays.asList("d2664870e7145eb73a2295766482c823")); executeTest("testInvertGenotypeFilterExpression", spec); } + + @Test + public void testInvertJexlGenotypeFilterExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP >= 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1, + Arrays.asList("8ddd8f3b5ee351c4ab79cb186b1d45ba")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument + executeTest("testInvertJexlGenotypeFilterExpression", spec); + } + + @Test + public void testSetFilteredGtoNocall() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf --setFilteredGtToNocall", 1, + Arrays.asList("9ff801dd726eb4fc562b278ccc6854b1")); + executeTest("testSetFilteredGtoNocall", spec); + } + + @Test + public void testSetVcfFilteredGtoNocall() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e") + ); + + spec.disableShadowBCF(); + executeTest("testSetVcfFilteredGtoNocall--" + testfile, spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index e0ce9405e..dce5ff522 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -64,8 +64,10 @@ public class SelectVariantsIntegrationTest extends WalkerTest { return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; } - private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537"; - private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b"; + private static final String SAMPLE_EXCLUSION_MD5 = "eea22fbf1e490e59389a663c3d6a6537"; + private static final String INVERT_SELECTION_MD5 = "831bc0a5a723b0681a910d668ff3757b"; + private static final String MAX_FILTERED_GT_SELECTION_MD5 = "0365de1bbf7c037be00badace0a74d02"; + private static final String MIN_FILTERED_GT_SELECTION_MD5 = "fcee8c8caa0696a6675961bb12664878"; @Test public void testDiscordanceNoSampleSpecified() { @@ -199,7 +201,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile, 1, - Arrays.asList(sampleExclusionMD5) + Arrays.asList(SAMPLE_EXCLUSION_MD5) ); spec.disableShadowBCF(); @@ -216,7 +218,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile, 1, - Arrays.asList(sampleExclusionMD5) + Arrays.asList(SAMPLE_EXCLUSION_MD5) ); spec.disableShadowBCF(); @@ -557,7 +559,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { } /** - * Test inverting the variant selection criteria + * Test inverting the variant selection criteria by the -invertSelect argument */ @Test public void testInvertSelection() { @@ -565,16 +567,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' -invertSelect --variant " + testfile), 1, - Arrays.asList(invertSelectionMD5) + Arrays.asList(INVERT_SELECTION_MD5) ); spec.disableShadowBCF(); executeTest("testInvertSelection--" + testfile, spec); } /** - * Test inverting the variant JEXL selection criteria + * Test inverting the variant selection criteria by inverting the JEXL expression logic following -select */ @Test public void testInvertJexlSelection() { @@ -584,7 +586,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile), 1, - Arrays.asList(invertSelectionMD5) + Arrays.asList(INVERT_SELECTION_MD5) ); spec.disableShadowBCF(); executeTest("testInvertJexlSelection--" + testfile, spec); @@ -659,10 +661,76 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String pedFile = privateTestDir + "CEUtrio.ped"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -invMv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", 1, Arrays.asList("35921fb2dedca0ead83027a66b725794")); executeTest("testMendelianViolationSelection--" + testFile, spec); } + + @Test + public void testMaxFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --maxFilteredGenotypes 1 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMaxFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMinFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --minFilteredGenotypes 2 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMinFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMaxFractionFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --maxFractionFilteredGenotypes 0.4 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMaxFractionFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testMinFractionFilteredGenotypesSelection() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --minFractionFilteredGenotypes 0.6 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5) + ); + spec.disableShadowBCF(); + executeTest("testMinFractionFilteredGenotypesSelection--" + testfile, spec); + } + + @Test + public void testSetFilteredGtoNocall() { + String testfile = privateTestDir + "filteredSamples.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e") + ); + + spec.disableShadowBCF(); + executeTest("testSetFilteredGtoNocall--" + testfile, spec); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index aa2e96f1a..4c4d6f02d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -179,15 +179,21 @@ public class VariantFiltration extends RodWalker { /** * Invert the selection criteria for --filterExpression */ - @Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false) + @Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false) protected boolean invertFilterExpression = false; /** * Invert the selection criteria for --genotypeFilterExpression */ - @Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) + @Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) protected boolean invertGenotypeFilterExpression = false; + /** + * If this argument is provided, set filtered genotypes to no-call (./.). + */ + @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") + private boolean setFilteredGenotypesToNocall = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -201,8 +207,10 @@ public class VariantFiltration extends RodWalker { private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one private ArrayList windowInitializer = new ArrayList(); + private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** - * Prepend inverse phrase to description if --invert_filter_expression + * Prepend inverse phrase to description if --invertFilterExpression * * @param description the description * @return the description with inverse prepended if --invert_filter_expression @@ -379,7 +387,7 @@ public class VariantFiltration extends RodWalker { final VariantContextBuilder builder = new VariantContextBuilder(vc); // make new Genotypes based on filters - if ( !genotypeFilterExps.isEmpty() ) { + if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) { GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); // for each genotype, check filters then create a new object @@ -388,12 +396,17 @@ public class VariantFiltration extends RodWalker { final List filters = new ArrayList(); if ( g.isFiltered() ) filters.add(g.getFilters()); + // Add if expression filters the variant context for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) ) filters.add(exp.name); } - genotypes.add(new GenotypeBuilder(g).filters(filters).make()); + // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call + if ( !filters.isEmpty() && setFilteredGenotypesToNocall ) + genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make()); + else + genotypes.add(new GenotypeBuilder(g).filters(filters).make()); } else { genotypes.add(g); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 63971c497..d44fbc84f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -146,7 +146,7 @@ import java.util.*; * -o output.vcf \ * -se 'SAMPLE.+PARC' \ * -select "QD > 10.0" - * --invert_selection + * -invertSelect * * *

Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):

@@ -207,7 +207,7 @@ import java.util.*; * -sn mySample * * - *

Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more

+ *

Generating a VCF of all the variants that are mendelian violations. The optional argument '-mvq' restricts the selection to sites that have a QUAL score of 50 or more

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -218,14 +218,14 @@ import java.util.*;
  *   -o violations.vcf
  * 
* - *

Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less

+ *

Generating a VCF of all the variants that are not mendelian violations. The optional argument '-mvq' together with '-invMv' restricts the selection to sites that have a QUAL score of 50 or less

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
  *   -R reference.fasta \
  *   -V input.vcf \
  *   -ped family.ped \
- *   -mv -mvq 50 -inv_mv \
+ *   -mv -mvq 50 -invMv \
  *   -o violations.vcf
  * 
* @@ -275,7 +275,7 @@ import java.util.*; * *

Select IDs in fileKeep and exclude IDs in fileExclude:

*
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * java -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
  *   -T SelectVariants \
  *   --variant input.vcf \
@@ -284,9 +284,35 @@ import java.util.*;
  *   -excludeIDs fileExclude
  * 
* + *

Select sites where there are between 2 and 5 samples and between 10 and 50 percent of the sample genotypes are filtered:

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   --maxFilteredGenotypes 5
+ *   --minFilteredGenotypes 2
+ *   --maxFractionFilteredGenotypes 0.60
+ *   --minFractionFilteredGenotypes 0.10
+ * 
+ * + *

Set filtered genotypes to no-call (./.):

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   --setFilteredGenotypesToNocall
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class SelectVariants extends RodWalker implements TreeReducible { + static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE; + static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0; + static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0; + static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0; + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -359,10 +385,10 @@ public class SelectVariants extends RodWalker implements TreeR public ArrayList selectExpressions = new ArrayList<>(); /** - * Invert the selection criteria for --select. + * Invert the selection criteria for -select. */ - @Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false) - protected boolean invertSelection = false; + @Argument(shortName="invertSelect", doc="Invert the selection criteria for -select", required=false) + protected boolean invertSelect = false; /* * If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none @@ -436,8 +462,8 @@ public class SelectVariants extends RodWalker implements TreeR * determined on the basis of family structure. Requires passing a pedigree file using the engine-level * `-ped` argument. */ - @Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false) - private Boolean invert_mendelianViolations = false; + @Argument(fullName="invertMendelianViolation", shortName="invMv", doc="Output non-mendelian violation sites only", required=false) + private Boolean invertMendelianViolations = false; /** * This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order @@ -514,6 +540,36 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include") private int minIndelSize = 0; + /** + * If this argument is provided, select sites where at most a maximum number of samples are filtered at the genotype level. + */ + @Argument(fullName="maxFilteredGenotypes", required=false, doc="Maximum number of samples filtered at the genotype level") + private int maxFilteredGenotypes = MAX_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where at least a minimum number of samples are filtered at the genotype level. + */ + @Argument(fullName="minFilteredGenotypes", required=false, doc="Minimum number of samples filtered at the genotype level") + private int minFilteredGenotypes = MIN_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where a fraction or less of the samples are filtered at the genotype level. + */ + @Argument(fullName="maxFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level") + private double maxFractionFilteredGenotypes = MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where a fraction or more of the samples are filtered at the genotype level. + */ + @Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level") + private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + + /** + * If this argument is provided, set filtered genotypes to no-call (./.). + */ + @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call") + private boolean setFilteredGenotypesToNocall = false; + @Hidden @Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.") private boolean allowNonOverlappingCommandLineSamples = false; @@ -547,6 +603,8 @@ public class SelectVariants extends RodWalker implements TreeR private Set IDsToRemove = null; private Map vcfRods; + private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ @@ -619,7 +677,7 @@ public class SelectVariants extends RodWalker implements TreeR selectedTypes.add(t); } - // remove specifed types + // remove specified types for (VariantContext.Type t : typesToExclude) selectedTypes.remove(t); @@ -713,16 +771,16 @@ public class SelectVariants extends RodWalker implements TreeR for (VariantContext vc : vcs) { // an option for performance testing only - if ( fullyDecode ) - vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() ); + if (fullyDecode) + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing()); - if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) ) + if (IDsToKeep != null && !IDsToKeep.contains(vc.getID())) continue; - if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) ) + if (IDsToRemove != null && IDsToRemove.contains(vc.getID())) continue; - if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) ) + if (mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invertMendelianViolations)) break; if (discordanceOnly) { @@ -745,20 +803,30 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) ) + if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize)) continue; + if ( needNumFilteredGenotypes()) { + int numFilteredSamples = numFilteredGenotypes(vc); + double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size(); + if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes || + fractionFilteredGenotypes > maxFractionFilteredGenotypes || fractionFilteredGenotypes < minFractionFilteredGenotypes) + continue; + } + VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); + VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall); + // Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered - if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) { + if ( (!XLnonVariants || filteredGenotypeToNocall.isPolymorphicInSamples()) && (!XLfiltered || !filteredGenotypeToNocall.isFiltered()) ) { // Write the subsetted variant if it matches all of the expressions boolean failedJexlMatch = false; try { for (VariantContextUtils.JexlVCMatchExp jexl : jexls) { - if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){ + if ( Utils.invertLogic(!VariantContextUtils.match(filteredGenotypeToNocall, jexl), invertSelect) ){ failedJexlMatch = true; break; } @@ -771,7 +839,7 @@ public class SelectVariants extends RodWalker implements TreeR if ( !failedJexlMatch && !justRead && ( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) { - vcfWriter.add(sub); + vcfWriter.add(filteredGenotypeToNocall); } } } @@ -800,6 +868,26 @@ public class SelectVariants extends RodWalker implements TreeR return false; } + /** + * Find the number of filtered samples + * + * @param vc the variant rod VariantContext + * @return number of filtered samples + */ + private int numFilteredGenotypes(final VariantContext vc){ + if (vc == null) + return 0; + + int numFiltered = 0; + // check if we find it in the variant rod + GenotypesContext genotypes = vc.getGenotypes(samples); + for (final Genotype g : genotypes) + if ( g.isFiltered() && !g.getFilters().isEmpty()) + numFiltered++; + + return numFiltered; + } + /** * Checks if vc has a variant call for (at least one of) the samples. * @@ -807,7 +895,7 @@ public class SelectVariants extends RodWalker implements TreeR * @param compVCs the comparison VariantContext (discordance) * @return true VariantContexts are discordant, false otherwise */ - private boolean isDiscordant (VariantContext vc, Collection compVCs) { + private boolean isDiscordant (final VariantContext vc, final Collection compVCs) { if (vc == null) return false; @@ -845,7 +933,7 @@ public class SelectVariants extends RodWalker implements TreeR * @param compVCs the comparison VariantContext * @return true if VariantContexts are concordant, false otherwise */ - private boolean isConcordant (VariantContext vc, Collection compVCs) { + private boolean isConcordant (final VariantContext vc, final Collection compVCs) { if (vc == null || compVCs == null || compVCs.isEmpty()) return false; @@ -876,7 +964,7 @@ public class SelectVariants extends RodWalker implements TreeR return true; } - private boolean sampleHasVariant(Genotype g) { + private boolean sampleHasVariant(final Genotype g) { return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered))); } @@ -945,8 +1033,7 @@ public class SelectVariants extends RodWalker implements TreeR for ( Genotype genotype : newGC ) { //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make()); + genotypes.add(new GenotypeBuilder(genotype).alleles(diploidNoCallAlleles).noGQ().make()); } else{ genotypes.add(genotype); @@ -966,6 +1053,30 @@ public class SelectVariants extends RodWalker implements TreeR return trimmed; } + /** + * If --setFilteredGtToNocall, set filtered genotypes to no-call + * + * @param vc the VariantContext record to set filtered genotypes to no-call + * @param filteredGenotypesToNocall set filtered genotypes to non-call? + * @return the VariantContext with no-call genotypes if the sample was filtered + */ + private VariantContext setFilteredGenotypeToNocall(final VariantContext vc, final boolean filteredGenotypesToNocall) { + + if ( !filteredGenotypesToNocall ) + return vc; + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); + + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.isCalled() && g.isFiltered() ) + genotypes.add(new GenotypeBuilder(g).alleles(diploidNoCallAlleles).make()); + else + genotypes.add(g); + } + + return builder.genotypes(genotypes).make(); + } /* * Add annotations to the new VC * @@ -1061,4 +1172,16 @@ public class SelectVariants extends RodWalker implements TreeR } return result; } -} + + /** + * Need the number of filtered genotypes samples? + * + * @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise + */ + private boolean needNumFilteredGenotypes(){ + return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE || + minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE || + maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE || + minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + } +} \ No newline at end of file