diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java index 5345130c2..a0e3f862b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -66,17 +66,36 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", 1, - Arrays.asList("dd238fe14b4a495a489907c1e021221e")); + Arrays.asList("5d82f53b036d9a0fca170e5be68d5ab2")); executeTest("test left alignment", spec); } @Test - public void testLeftAlignmentWithTrimmingAndMultialleliecs() { + public void testLeftAlignmentDontTrim() { WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header", + 1, + Arrays.asList("dd238fe14b4a495a489907c1e021221e")); + executeTest("test left alignment, don't trim", spec); + } + + @Test + public void testLeftAlignmentWithMultialleliecs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -split", 1, Arrays.asList("534bea653d4a0e59e74f4107c1768558")); - executeTest("test left alignment with trimming and hard multiple alleles", spec); + executeTest("test left alignment with hard multiple alleles", spec); + + } + + @Test + public void testLeftAlignmentDontTrimWithMultialleliecs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header -split", + 1, + Arrays.asList("189b8136ee62b54bf7b227e99c892440")); + executeTest("test left alignment with hard multiple alleles, don't trim", spec); } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index 70f73ac51..d62a944c9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -60,11 +60,12 @@ import java.util.*; * Left-align indels in a variant callset * *

- * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be - * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to - * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. - * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. - * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation.

+ * LeftAlignAndTrimVariants is a tool that takes a VCF file, left-aligns the indels and trims common bases from indels, + * leaving them with a minimum representation. The same indel can often be placed at multiple positions and still + * represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position + * this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic + * sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels. + *

* *

Input

*

@@ -76,7 +77,9 @@ import java.util.*; * A left-aligned VCF. *

* - *

Usage example

+ *

Usage examples

+ * + *

Left align and trim alleles

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T LeftAlignAndTrimVariants \
@@ -85,6 +88,37 @@ import java.util.*;
  *   -o output.vcf
  * 
* + *

Left align and don't trim alleles

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T LeftAlignAndTrimVariants \
+ *   -R reference.fasta \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ *   --dontTrimAlleles
+ * 
+ * + *

Split multiallics into biallelics, left align and trim alleles

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T LeftAlignAndTrimVariants \
+ *   -R reference.fasta \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ *   --splitMultiallelics
+ * 
+ * + *

Split multiallelics into biallics, left align but don't trim alleles

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T LeftAlignAndTrimVariants \
+ *   -R reference.fasta \
+ *   --variant input.vcf \
+ *   -o output.vcf
+ *   --splitMultiallelics
+ *   --dontTrimAlleles
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! @@ -94,10 +128,10 @@ public class LeftAlignAndTrimVariants extends RodWalker { protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** - * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. + * If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation. */ - @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) - protected boolean trimAlleles = false; + @Argument(fullName="dontTrimAlleles", shortName="notrim", doc="Do not Trim alleles to remove bases common to all of them", required=false) + protected boolean dontTrimAlleles = false; /** * If this argument is set, split multiallelic records and left-align individual alleles. @@ -132,32 +166,16 @@ public class LeftAlignAndTrimVariants extends RodWalker { int changedSites = 0; for ( final VariantContext vc : VCs ) { - // split first into biallelics, and optionally trim alleles to minimal representation - Pair result = new Pair(vc,0); // default value + // split first into biallelics, and optionally don't trim alleles to minimal representation if (splitMultiallelics) { final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); for (final VariantContext biallelicVC: vcList) { - final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true) : biallelicVC); - result = alignAndWrite(v, ref); - - // strip out PLs and AD if we've subsetted the alleles - if ( vcList.size() > 1 ) - result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make(); - - writer.add(result.first); - changedSites += result.second; + changedSites += trimAlignWrite(biallelicVC, ref, vcList.size()); } } else { - if (trimAlleles) - result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); - else - result = alignAndWrite(vc,ref); - writer.add(result.first); - changedSites += result.second; - + changedSites += trimAlignWrite(vc, ref, 1); } - } return changedSites; @@ -175,11 +193,39 @@ public class LeftAlignAndTrimVariants extends RodWalker { } /** - * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be + * Trim, align and write out the vc. + * + * @param vc Input VC with variants to left align + * @param ref Reference context + * @param numBiallelics Number of biallelics from the original VC + * @return Number of records left-aligned (0 or 1) + */ + @Requires("vc != null") + protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){ + + // optionally don't trim VC + final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true); + + // align the VC + final Pair result = alignAndWrite(v, ref); + + // strip out PLs and AD if we've subsetted the alleles + if ( numBiallelics > 1 ) + result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make(); + + // write out new VC + writer.add(result.first); + + // number of records left aligned + return result.second; + } + + /** + * Main routine workhorse. By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be * handled by calling routine. * @param vc Input VC with variants to left align * @param ref Reference context - * @return # of records left-aligned (0 or 1) and new VC. + * @return Number of records left-aligned (0 or 1) and new VC. */ @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) @Ensures({"result != null","result.first != null", "result.second >=0"})