From d16ed98c9eac696aee83330bf2121efab8cae101 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Mon, 28 Dec 2015 12:23:14 -0500 Subject: [PATCH] Backport maxNoCall functionality from GATK4 --- .../SelectVariantsIntegrationTest.java | 54 +++++++++++++++++ .../walkers/variantutils/SelectVariants.java | 58 +++++++++++++++++-- 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 64636d182..56c3a7d8d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -69,6 +69,8 @@ public class SelectVariantsIntegrationTest extends WalkerTest { private static final String INVERT_SELECTION_MD5 = "26d192b868746ab14133f145ae812e7c"; private static final String MAX_FILTERED_GT_SELECTION_MD5 = "f83ac0deb7a8b022d6d40a85627a71ec"; private static final String MIN_FILTERED_GT_SELECTION_MD5 = "346620b7a5d66dabf89d3f42d6e27db7"; + private static final String NO_CALL_FILTERING_KEEP_ONE = "6e2401190c5ada6a3bed2640c068f43b"; + private static final String NO_CALL_FILTERING_KEEP_TWO = "6bced1ab6a3d58f1fd905b7f601987a3"; @Test public void testDiscordanceNoSampleSpecified() { @@ -771,4 +773,56 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testSACNonDiploid", spec); } + + @Test + public void testMaxNoCall1() { + final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLnumber 1", + 1, + Arrays.asList(NO_CALL_FILTERING_KEEP_ONE)); + spec.disableShadowBCF(); + + executeTest("testMaxNoCall1", spec); + } + + @Test + public void testMaxNoCall0_25() { + final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLfraction 0.25", + 1, + Arrays.asList(NO_CALL_FILTERING_KEEP_ONE)); + spec.disableShadowBCF(); + + executeTest("testMaxNoCall0_25", spec); + } + + @Test + public void testMaxNoCall2() { + final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLnumber 2", + 1, + Arrays.asList(NO_CALL_FILTERING_KEEP_TWO)); + spec.disableShadowBCF(); + + executeTest("testMaxNoCall2", spec); + } + + @Test + public void testMaxNoCall0_5() { + final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLfraction 0.5", + 1, + Arrays.asList(NO_CALL_FILTERING_KEEP_TWO)); + spec.disableShadowBCF(); + + executeTest("testMaxNoCall0_5", spec); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index abd53f9f0..62918c4f9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -309,9 +309,11 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class SelectVariants extends RodWalker implements TreeReducible { static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE; - static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0; static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0; + static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0; static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0; + private static final int MAX_NOCALL_NUMBER_DEFAULT_VALUE = Integer.MAX_VALUE; + private static final double MAX_NOCALL_FRACTION_DEFAULT_VALUE = 1.0; @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -564,6 +566,18 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level") private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; + /** + * If this argument is provided, select sites where at most the given number of samples have no-call genotypes. + */ + @Argument(fullName="maxNOCALLnumber", required=false, doc="Maximum number of samples with no-call genotypes") + private int maxNOCALLnumber = MAX_NOCALL_NUMBER_DEFAULT_VALUE; + + /** + * If this argument is provided, select sites where at most the given fraction of samples have no-call genotypes. + */ + @Argument(fullName="maxNOCALLfraction", required=false, doc="Maximum fraction of samples with no-call genotypes") + private double maxNOCALLfraction = MAX_NOCALL_FRACTION_DEFAULT_VALUE; + /** * If this argument is provided, set filtered genotypes to no-call (./.). */ @@ -815,7 +829,7 @@ public class SelectVariants extends RodWalker implements TreeR if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize)) continue; - if ( needNumFilteredGenotypes()) { + if ( considerFilteredGenotypes()) { int numFilteredSamples = numFilteredGenotypes(vc); double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size(); if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes || @@ -823,6 +837,13 @@ public class SelectVariants extends RodWalker implements TreeR continue; } + if (considerNoCallGenotypes()) { + final int numNoCallSamples = numNoCallGenotypes(vc); + final double fractionNoCallGenotypes = samples.isEmpty() ? 0.0 : ((double) numNoCallSamples) / samples.size(); + if (numNoCallSamples > maxNOCALLnumber || fractionNoCallGenotypes > maxNOCALLfraction) + continue; + } + VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall); @@ -877,6 +898,25 @@ public class SelectVariants extends RodWalker implements TreeR return false; } + /** + * Find the number of no-call genotypes + * + * @param vc the variant context + * @return number of no-call genotypes + */ + private int numNoCallGenotypes(final VariantContext vc) { + if (vc == null) + return 0; + + int numFiltered = 0; + for ( final Genotype g : vc.getGenotypes() ) { + if ( g.isNoCall() ) + numFiltered++; + } + + return numFiltered; + } + /** * Find the number of filtered samples * @@ -1183,14 +1223,24 @@ public class SelectVariants extends RodWalker implements TreeR } /** - * Need the number of filtered genotypes samples? + * Should the number of filtered genotypes be considered for filtering? * * @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise */ - private boolean needNumFilteredGenotypes(){ + private boolean considerFilteredGenotypes(){ return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE || minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE || maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE || minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE; } + + /** + * Should the number of no-call genotypes be considered for filtering? + * + * @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise + * + */ + private boolean considerNoCallGenotypes(){ + return maxNOCALLnumber != MAX_NOCALL_NUMBER_DEFAULT_VALUE || maxNOCALLfraction != MAX_NOCALL_FRACTION_DEFAULT_VALUE; + } } \ No newline at end of file