From 795fe75886dcbadb60e0fb845eab2bba86f33ba5 Mon Sep 17 00:00:00 2001
From: Ron Levine
Date: Fri, 16 Oct 2015 17:21:52 -0400
Subject: [PATCH] Update doc for multiallelics, trimming is the default
behavior
---
...ftAlignAndTrimVariantsIntegrationTest.java | 27 ++++-
.../LeftAlignAndTrimVariants.java | 106 +++++++++++++-----
2 files changed, 99 insertions(+), 34 deletions(-)
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java
index 5345130c2..a0e3f862b 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java
@@ -66,17 +66,36 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header",
1,
- Arrays.asList("dd238fe14b4a495a489907c1e021221e"));
+ Arrays.asList("5d82f53b036d9a0fca170e5be68d5ab2"));
executeTest("test left alignment", spec);
}
@Test
- public void testLeftAlignmentWithTrimmingAndMultialleliecs() {
+ public void testLeftAlignmentDontTrim() {
WalkerTestSpec spec = new WalkerTestSpec(
- "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split",
+ "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header",
+ 1,
+ Arrays.asList("dd238fe14b4a495a489907c1e021221e"));
+ executeTest("test left alignment, don't trim", spec);
+ }
+
+ @Test
+ public void testLeftAlignmentWithMultialleliecs() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -split",
1,
Arrays.asList("534bea653d4a0e59e74f4107c1768558"));
- executeTest("test left alignment with trimming and hard multiple alleles", spec);
+ executeTest("test left alignment with hard multiple alleles", spec);
+
+ }
+
+ @Test
+ public void testLeftAlignmentDontTrimWithMultialleliecs() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header -split",
+ 1,
+ Arrays.asList("189b8136ee62b54bf7b227e99c892440"));
+ executeTest("test left alignment with hard multiple alleles, don't trim", spec);
}
}
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java
index 70f73ac51..d62a944c9 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java
@@ -60,11 +60,12 @@ import java.util.*;
* Left-align indels in a variant callset
*
*
- * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be
- * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
- * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
- * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged.
- * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation.
+ * LeftAlignAndTrimVariants is a tool that takes a VCF file, left-aligns the indels and trims common bases from indels,
+ * leaving them with a minimum representation. The same indel can often be placed at multiple positions and still
+ * represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position
+ * this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic
+ * sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.
+ *
*
* Input
*
@@ -76,7 +77,9 @@ import java.util.*;
* A left-aligned VCF.
*
*
- * Usage example
+ * Usage examples
+ *
+ * Left align and trim alleles
*
* java -jar GenomeAnalysisTK.jar \
* -T LeftAlignAndTrimVariants \
@@ -85,6 +88,37 @@ import java.util.*;
* -o output.vcf
*
*
+ * Left align and don't trim alleles
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T LeftAlignAndTrimVariants \
+ * -R reference.fasta \
+ * --variant input.vcf \
+ * -o output.vcf
+ * --dontTrimAlleles
+ *
+ *
+ * Split multiallics into biallelics, left align and trim alleles
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T LeftAlignAndTrimVariants \
+ * -R reference.fasta \
+ * --variant input.vcf \
+ * -o output.vcf
+ * --splitMultiallelics
+ *
+ *
+ * Split multiallelics into biallics, left align but don't trim alleles
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -T LeftAlignAndTrimVariants \
+ * -R reference.fasta \
+ * --variant input.vcf \
+ * -o output.vcf
+ * --splitMultiallelics
+ * --dontTrimAlleles
+ *
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well!
@@ -94,10 +128,10 @@ public class LeftAlignAndTrimVariants extends RodWalker {
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
- * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation.
+ * If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation.
*/
- @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false)
- protected boolean trimAlleles = false;
+ @Argument(fullName="dontTrimAlleles", shortName="notrim", doc="Do not Trim alleles to remove bases common to all of them", required=false)
+ protected boolean dontTrimAlleles = false;
/**
* If this argument is set, split multiallelic records and left-align individual alleles.
@@ -132,32 +166,16 @@ public class LeftAlignAndTrimVariants extends RodWalker {
int changedSites = 0;
for ( final VariantContext vc : VCs ) {
- // split first into biallelics, and optionally trim alleles to minimal representation
- Pair result = new Pair(vc,0); // default value
+ // split first into biallelics, and optionally don't trim alleles to minimal representation
if (splitMultiallelics) {
final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
for (final VariantContext biallelicVC: vcList) {
- final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true) : biallelicVC);
- result = alignAndWrite(v, ref);
-
- // strip out PLs and AD if we've subsetted the alleles
- if ( vcList.size() > 1 )
- result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make();
-
- writer.add(result.first);
- changedSites += result.second;
+ changedSites += trimAlignWrite(biallelicVC, ref, vcList.size());
}
}
else {
- if (trimAlleles)
- result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref);
- else
- result = alignAndWrite(vc,ref);
- writer.add(result.first);
- changedSites += result.second;
-
+ changedSites += trimAlignWrite(vc, ref, 1);
}
-
}
return changedSites;
@@ -175,11 +193,39 @@ public class LeftAlignAndTrimVariants extends RodWalker {
}
/**
- * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be
+ * Trim, align and write out the vc.
+ *
+ * @param vc Input VC with variants to left align
+ * @param ref Reference context
+ * @param numBiallelics Number of biallelics from the original VC
+ * @return Number of records left-aligned (0 or 1)
+ */
+ @Requires("vc != null")
+ protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){
+
+ // optionally don't trim VC
+ final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true);
+
+ // align the VC
+ final Pair result = alignAndWrite(v, ref);
+
+ // strip out PLs and AD if we've subsetted the alleles
+ if ( numBiallelics > 1 )
+ result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make();
+
+ // write out new VC
+ writer.add(result.first);
+
+ // number of records left aligned
+ return result.second;
+ }
+
+ /**
+ * Main routine workhorse. By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be
* handled by calling routine.
* @param vc Input VC with variants to left align
* @param ref Reference context
- * @return # of records left-aligned (0 or 1) and new VC.
+ * @return Number of records left-aligned (0 or 1) and new VC.
*/
@Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"})
@Ensures({"result != null","result.first != null", "result.second >=0"})