From 36ca9fe8986af18ebe4714f935b28b4932cadc0d Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 23 Oct 2015 20:02:52 -0400 Subject: [PATCH] Allow LeftAlignAndTrimVariants to handle alleles longer than the default processing window --- ...ftAlignAndTrimVariantsIntegrationTest.java | 52 +++++++++++++++++++ .../LeftAlignAndTrimVariants.java | 38 ++++++++++++-- .../variantutils/ValidateVariants.java | 2 +- 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java index a0e3f862b..4b5d9979f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -51,9 +51,14 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; import java.util.Arrays; /** @@ -70,6 +75,53 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { executeTest("test left alignment", spec); } + @Test + public void testLeftAlignmentLongAllelesError() throws IOException { + + // Need to see log INFO messages + Level level = logger.getLevel(); + logger.setLevel(Level.INFO); + + File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp"); + String logFileName = logFile.getAbsolutePath(); + + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "longAlleles.vcf --no_cmdline_in_header -log " + logFileName, + 1, + Arrays.asList("136f88a5bd07a022a3404089359cb8ee")); + executeTest("test left alignment with long alleles with an error", spec); + + // Make sure the "reference allele too long" message is in the log + Assert.assertTrue(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG)); + + // Set the log level back + logger.setLevel(level); + } + + @Test + public void testLeftAlignmentLongAllelesFix() throws IOException { + + // Need to see log INFO messages + Level level = logger.getLevel(); + logger.setLevel(Level.INFO); + + File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp"); + String logFileName = logFile.getAbsolutePath(); + + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + + "longAlleles.vcf --no_cmdline_in_header --reference_window_stop 208 -log " + logFileName, + 1, + Arrays.asList("c4ca5520ee499da171053059e3717b2f")); + executeTest("test left alignment with long alleles fix", spec); + + // Make sure the "reference allele too long" message is in the log + Assert.assertFalse(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG)); + + // Set the log level back + logger.setLevel(level); + } + @Test public void testLeftAlignmentDontTrim() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index d62a944c9..0ba7e1013 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -94,17 +94,27 @@ import java.util.*; * -T LeftAlignAndTrimVariants \ * -R reference.fasta \ * --variant input.vcf \ - * -o output.vcf + * -o output.vcf \ * --dontTrimAlleles * * + *

Left align and trim alleles, process alleles <= 208 bases

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T LeftAlignAndTrimVariants \
+ *   -R reference.fasta \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   --reference_window_stop 208
+ * 
+ * *

Split multiallics into biallelics, left align and trim alleles

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T LeftAlignAndTrimVariants \
  *   -R reference.fasta \
  *   --variant input.vcf \
- *   -o output.vcf
+ *   -o output.vcf \
  *   --splitMultiallelics
  * 
* @@ -114,8 +124,8 @@ import java.util.*; * -T LeftAlignAndTrimVariants \ * -R reference.fasta \ * --variant input.vcf \ - * -o output.vcf - * --splitMultiallelics + * -o output.vcf \ + * --splitMultiallelics \ * --dontTrimAlleles * * @@ -124,6 +134,9 @@ import java.util.*; @Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! public class LeftAlignAndTrimVariants extends RodWalker { + // Log message for a reference allele that is too long + protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long"; + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -147,6 +160,10 @@ public class LeftAlignAndTrimVariants extends RodWalker { private VariantContextWriter writer; private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! + + // Stop of the expanded window for which the reference context should be provided, relative to the locus. + private int referenceWindowStop; + public void initialize() { String trackName = variantCollection.variants.getName(); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); @@ -155,7 +172,9 @@ public class LeftAlignAndTrimVariants extends RodWalker { Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); baseWriter.writeHeader(new VCFHeader(headerLines, samples)); - writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH); + + referenceWindowStop = getToolkit().getArguments().reference_window_stop; } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -203,6 +222,15 @@ public class LeftAlignAndTrimVariants extends RodWalker { @Requires("vc != null") protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){ + final int refLength = vc.getReference().length(); + + // ignore if the reference length is greater than the reference window stop before and after expansion + if ( refLength > MAX_INDEL_LENGTH && refLength > referenceWindowStop ) { + logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d", + REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength)); + return 0; + } + // optionally don't trim VC final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index 1bfbf753f..a5b710acc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -91,7 +91,7 @@ import java.util.*; * --dbsnp dbsnp.vcf * * - *

To perform VCF format tests and all strict validations with the VCFs containing alleles > 208 bases

+ *

To perform VCF format tests and all strict validations with the VCFs containing alleles <= 208 bases

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T ValidateVariants \