diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 2837c405d..709185821 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -53,8 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; public class SelectVariantsIntegrationTest extends WalkerTest { @@ -280,7 +282,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants --keepOriginalAC -env -trimAlternates -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6") ); @@ -319,7 +321,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; String samplesFile = privateTestDir + "GIH.samples.list"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants -trimAlternates --variant " + testfile, 1, Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") ); @@ -382,8 +384,35 @@ public class SelectVariantsIntegrationTest extends WalkerTest { @Test public void testAlleleTrimming() { final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf"; - final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env " - + testFile + " -o %s --no_cmdline_in_header"; - WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea")); + final String cmd = "-T SelectVariants -R " + b37KGReference + " -sn NA12878 -env -trimAlternates " + + "-V " + testFile + " -o %s --no_cmdline_in_header"; + WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df942000eb18b12d9008c7d9b5c4178")); + executeTest("testAlleleTrimming", spec); + } + + @DataProvider(name="unusedAlleleTrimmingProvider") + public Object[][] unusedAlleleTrimmingProvider() { + return new Object[][] { + { privateTestDir+"forHardLeftAlignVariantsTest.vcf", "-trimAlternates", "9df942000eb18b12d9008c7d9b5c4178"}, + { privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "981b757e3dc6bf3864ac7e493cf9d30d"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "8ded359dd87fd498ff38736ea0fa4c28"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "a7e7288dcd779cfac6983069de45b79c"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2e726d06a8d317199e8dda74691948a3"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "1e5585f86c347da271a79fbfc61ac849"} + }; + } + + @Test(dataProvider = "unusedAlleleTrimmingProvider") + public void testUnusedAlleleTrimming(final String vcf, final String extraArgs, final String md5) { + final WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants" + + " -R "+b37KGReference + + " -V "+vcf + + " -o %s --no_cmdline_in_header" + + " "+extraArgs, + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUnusedAlleleTrimming: (%s,%s)", new File(vcf).getName(), extraArgs), spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java index 6d753aae9..8116b6b11 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -99,7 +99,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest { } { // AD and PL decoding race condition final String testfile = privateTestDir + "race_condition.vcf"; - final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; + final String args = "-env -trimAlternates -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt); } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 3da616632..98393596f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -253,6 +253,15 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false) protected boolean preserveAlleles = false; + /** + * When this argument is present, all alternate alleles that are not present in the (output) samples will be removed. + * Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be + * removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do + * not include the alternate allele in any genotype calls. + */ + @Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false) + protected boolean removeUnusedAlternates = false; + /** * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. * For example, a multiallelic record such as: @@ -528,7 +537,7 @@ public class SelectVariants extends RodWalker implements TreeR if ( containsIndelLargerThan(vc, maxIndelSize) ) continue; - VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles); + VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { boolean failedJexlMatch = false; @@ -681,25 +690,30 @@ public class SelectVariants extends RodWalker implements TreeR * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF). * * @param vc the VariantContext record to subset - * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles? + * @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it? + * @param removeUnusedAlternates removes alternate alleles with AC=0 * @return the subsetted VariantContext */ - private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) { - if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() ) + private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) { + //subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible + if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates ) return vc; - final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used + // strip out the alternate alleles that aren't being used + final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates); + + //If no subsetting happened, exit now + if ( sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles() ) + return vc; final VariantContextBuilder builder = new VariantContextBuilder(sub); // if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc); - // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags - if ( vc.getNSamples() != sub.getNSamples() ) { - builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); - builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY); - } + // since the VC has been subset (either by sample or allele), we need to strip out the MLE tags + builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); + builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY); // Remove a fraction of the genotypes if needed if ( fractionGenotypes > 0 ){