Update doc for multiallelics, trimming is the default behavior
This commit is contained in:
parent
7b067c13bd
commit
795fe75886
|
|
@ -66,17 +66,36 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("dd238fe14b4a495a489907c1e021221e"));
|
||||
Arrays.asList("5d82f53b036d9a0fca170e5be68d5ab2"));
|
||||
executeTest("test left alignment", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentWithTrimmingAndMultialleliecs() {
|
||||
public void testLeftAlignmentDontTrim() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split",
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("dd238fe14b4a495a489907c1e021221e"));
|
||||
executeTest("test left alignment, don't trim", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentWithMultialleliecs() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -split",
|
||||
1,
|
||||
Arrays.asList("534bea653d4a0e59e74f4107c1768558"));
|
||||
executeTest("test left alignment with trimming and hard multiple alleles", spec);
|
||||
executeTest("test left alignment with hard multiple alleles", spec);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentDontTrimWithMultialleliecs() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header -split",
|
||||
1,
|
||||
Arrays.asList("189b8136ee62b54bf7b227e99c892440"));
|
||||
executeTest("test left alignment with hard multiple alleles, don't trim", spec);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,11 +60,12 @@ import java.util.*;
|
|||
* Left-align indels in a variant callset
|
||||
*
|
||||
* <p>
|
||||
* LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be
|
||||
* placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
|
||||
* place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
|
||||
* Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged.
|
||||
* Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation.</p>
|
||||
* LeftAlignAndTrimVariants is a tool that takes a VCF file, left-aligns the indels and trims common bases from indels,
|
||||
* leaving them with a minimum representation. The same indel can often be placed at multiple positions and still
|
||||
* represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position
|
||||
* this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic
|
||||
* sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
|
|
@ -76,7 +77,9 @@ import java.util.*;
|
|||
* A left-aligned VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Usage example</h3>
|
||||
* <h3>Usage examples</h3>
|
||||
*
|
||||
* <h4>Left align and trim alleles</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
|
|
@ -85,6 +88,37 @@ import java.util.*;
|
|||
* -o output.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Left align and don't trim alleles</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* --dontTrimAlleles
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Split multiallics into biallelics, left align and trim alleles</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* --splitMultiallelics
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Split multiallelics into biallics, left align but don't trim alleles</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* --splitMultiallelics
|
||||
* --dontTrimAlleles
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well!
|
||||
|
|
@ -94,10 +128,10 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
/**
|
||||
* If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation.
|
||||
* If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation.
|
||||
*/
|
||||
@Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false)
|
||||
protected boolean trimAlleles = false;
|
||||
@Argument(fullName="dontTrimAlleles", shortName="notrim", doc="Do not Trim alleles to remove bases common to all of them", required=false)
|
||||
protected boolean dontTrimAlleles = false;
|
||||
|
||||
/**
|
||||
* If this argument is set, split multiallelic records and left-align individual alleles.
|
||||
|
|
@ -132,32 +166,16 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
int changedSites = 0;
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
// split first into biallelics, and optionally trim alleles to minimal representation
|
||||
Pair<VariantContext,Integer> result = new Pair<VariantContext, Integer>(vc,0); // default value
|
||||
// split first into biallelics, and optionally don't trim alleles to minimal representation
|
||||
if (splitMultiallelics) {
|
||||
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
|
||||
for (final VariantContext biallelicVC: vcList) {
|
||||
final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true) : biallelicVC);
|
||||
result = alignAndWrite(v, ref);
|
||||
|
||||
// strip out PLs and AD if we've subsetted the alleles
|
||||
if ( vcList.size() > 1 )
|
||||
result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make();
|
||||
|
||||
writer.add(result.first);
|
||||
changedSites += result.second;
|
||||
changedSites += trimAlignWrite(biallelicVC, ref, vcList.size());
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (trimAlleles)
|
||||
result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref);
|
||||
else
|
||||
result = alignAndWrite(vc,ref);
|
||||
writer.add(result.first);
|
||||
changedSites += result.second;
|
||||
|
||||
changedSites += trimAlignWrite(vc, ref, 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return changedSites;
|
||||
|
|
@ -175,11 +193,39 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be
|
||||
* Trim, align and write out the vc.
|
||||
*
|
||||
* @param vc Input VC with variants to left align
|
||||
* @param ref Reference context
|
||||
* @param numBiallelics Number of biallelics from the original VC
|
||||
* @return Number of records left-aligned (0 or 1)
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){
|
||||
|
||||
// optionally don't trim VC
|
||||
final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true);
|
||||
|
||||
// align the VC
|
||||
final Pair<VariantContext,Integer> result = alignAndWrite(v, ref);
|
||||
|
||||
// strip out PLs and AD if we've subsetted the alleles
|
||||
if ( numBiallelics > 1 )
|
||||
result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make();
|
||||
|
||||
// write out new VC
|
||||
writer.add(result.first);
|
||||
|
||||
// number of records left aligned
|
||||
return result.second;
|
||||
}
|
||||
|
||||
/**
|
||||
* Main routine workhorse. By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be
|
||||
* handled by calling routine.
|
||||
* @param vc Input VC with variants to left align
|
||||
* @param ref Reference context
|
||||
* @return # of records left-aligned (0 or 1) and new VC.
|
||||
* @return Number of records left-aligned (0 or 1) and new VC.
|
||||
*/
|
||||
@Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"})
|
||||
@Ensures({"result != null","result.first != null", "result.second >=0"})
|
||||
|
|
|
|||
Loading…
Reference in New Issue