From 783a4fd651306553dcbbbc5f3a08973826b65d19 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Fri, 7 Nov 2014 10:28:53 -0500 Subject: [PATCH] Change default behavior of SelectVariants to trim remaining alleles when samples are subset. -noTrim argument preserves original alleles. Add test for trimming. --- .../SelectVariantsIntegrationTest.java | 8 +++++ ...SelectVariantsParallelIntegrationTest.java | 2 +- .../walkers/variantutils/SelectVariants.java | 29 ++++++++++++++++--- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index ce7834a49..2837c405d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -378,4 +378,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest { UserException.class); executeTest("InvalidJexl", spec); } + + @Test + public void testAlleleTrimming() { + final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf"; + final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env " + + testFile + " -o %s --no_cmdline_in_header"; + WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea")); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java index 1ae01f179..6d753aae9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -100,7 +100,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest { { // AD and PL decoding race condition final String testfile = privateTestDir + "race_condition.vcf"; final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; - new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt); + new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt); } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 72dbcdf18..50c8ca61f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -104,7 +104,7 @@ import java.util.*; * -se 'SAMPLE.+PARC' * -select "QD > 10.0" * - * Select a sample and exclude non-variant loci and filtered loci: + * Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default): * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T SelectVariants \ @@ -114,6 +114,16 @@ import java.util.*; * -env \ * -ef * + * Select a sample, subset remaining alleles, but don't trim: + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T SelectVariants \ + * --variant input.vcf \ + * -o output.vcf \ + * -sn SAMPLE_1_ACTG \ + * -env \ + * -noTrim + * * Select a sample and restrict the output vcf to a set of intervals: * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ @@ -234,6 +244,13 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) protected boolean EXCLUDE_FILTERED = false; + /** + * Default is to remove bases common to all remaining alleles, leaving only their minimal representation. + * If this argument is set, original alleles from input VCF will be preserved. + */ + @Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false) + protected boolean preserveAlleles = false; + /** * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. * For example, a multiallelic record such as: @@ -509,7 +526,7 @@ public class SelectVariants extends RodWalker implements TreeR if ( containsIndelLargerThan(vc, maxIndelSize) ) continue; - VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles); if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { boolean failedJexlMatch = false; @@ -665,7 +682,7 @@ public class SelectVariants extends RodWalker implements TreeR * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles? * @return the subsetted VariantContext */ - private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) { + private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) { if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() ) return vc; @@ -702,7 +719,11 @@ public class SelectVariants extends RodWalker implements TreeR addAnnotations(builder, vc, sub.getSampleNames()); - return builder.make(); + final VariantContext subset = builder.make(); + + final VariantContext trimmed = preserveAlleles? subset : GATKVariantContextUtils.trimAlleles(subset,true,true); + + return trimmed; } /*