Allow LeftAlignAndTrimVariants to handle alleles longer than the default processing window

This commit is contained in:
Ron Levine 2015-10-23 20:02:52 -04:00
parent 0d936b28c4
commit 36ca9fe898
3 changed files with 86 additions and 6 deletions

View File

@ -51,9 +51,14 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
/**
@ -70,6 +75,53 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
executeTest("test left alignment", spec);
}
@Test
public void testLeftAlignmentLongAllelesError() throws IOException {
// Need to see log INFO messages
Level level = logger.getLevel();
logger.setLevel(Level.INFO);
File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp");
String logFileName = logFile.getAbsolutePath();
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "longAlleles.vcf --no_cmdline_in_header -log " + logFileName,
1,
Arrays.asList("136f88a5bd07a022a3404089359cb8ee"));
executeTest("test left alignment with long alleles with an error", spec);
// Make sure the "reference allele too long" message is in the log
Assert.assertTrue(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG));
// Set the log level back
logger.setLevel(level);
}
@Test
public void testLeftAlignmentLongAllelesFix() throws IOException {
// Need to see log INFO messages
Level level = logger.getLevel();
logger.setLevel(Level.INFO);
File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp");
String logFileName = logFile.getAbsolutePath();
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir +
"longAlleles.vcf --no_cmdline_in_header --reference_window_stop 208 -log " + logFileName,
1,
Arrays.asList("c4ca5520ee499da171053059e3717b2f"));
executeTest("test left alignment with long alleles fix", spec);
// Make sure the "reference allele too long" message is in the log
Assert.assertFalse(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG));
// Set the log level back
logger.setLevel(level);
}
@Test
public void testLeftAlignmentDontTrim() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -94,17 +94,27 @@ import java.util.*;
* -T LeftAlignAndTrimVariants \
* -R reference.fasta \
* --variant input.vcf \
* -o output.vcf
* -o output.vcf \
* --dontTrimAlleles
* </pre>
*
* <h4>Left align and trim alleles, process alleles <= 208 bases</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T LeftAlignAndTrimVariants \
* -R reference.fasta \
* --variant input.vcf \
* -o output.vcf \
* --reference_window_stop 208
* </pre>
*
* <h4>Split multiallics into biallelics, left align and trim alleles</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T LeftAlignAndTrimVariants \
* -R reference.fasta \
* --variant input.vcf \
* -o output.vcf
* -o output.vcf \
* --splitMultiallelics
* </pre>
*
@ -114,8 +124,8 @@ import java.util.*;
* -T LeftAlignAndTrimVariants \
* -R reference.fasta \
* --variant input.vcf \
* -o output.vcf
* --splitMultiallelics
* -o output.vcf \
* --splitMultiallelics \
* --dontTrimAlleles
* </pre>
*
@ -124,6 +134,9 @@ import java.util.*;
@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well!
public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
// Log message for a reference allele that is too long
protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long";
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -147,6 +160,10 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
private VariantContextWriter writer;
private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size!
// Stop of the expanded window for which the reference context should be provided, relative to the locus.
private int referenceWindowStop;
public void initialize() {
String trackName = variantCollection.variants.getName();
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
@ -155,7 +172,9 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder();
baseWriter.writeHeader(new VCFHeader(headerLines, samples));
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200);
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH);
referenceWindowStop = getToolkit().getArguments().reference_window_stop;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -203,6 +222,15 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
@Requires("vc != null")
protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){
final int refLength = vc.getReference().length();
// ignore if the reference length is greater than the reference window stop before and after expansion
if ( refLength > MAX_INDEL_LENGTH && refLength > referenceWindowStop ) {
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d",
REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength));
return 0;
}
// optionally don't trim VC
final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true);

View File

@ -91,7 +91,7 @@ import java.util.*;
* --dbsnp dbsnp.vcf
* </pre>
*
* <h4>To perform VCF format tests and all strict validations with the VCFs containing alleles > 208 bases</h4>
* <h4>To perform VCF format tests and all strict validations with the VCFs containing alleles <= 208 bases</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T ValidateVariants \