Allow LeftAlignAndTrimVariants to handle alleles longer than the default processing window
This commit is contained in:
parent
0d936b28c4
commit
36ca9fe898
|
|
@ -51,9 +51,14 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.variantutils;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.log4j.Level;
|
||||
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
|
|
@ -70,6 +75,53 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
|
|||
executeTest("test left alignment", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentLongAllelesError() throws IOException {
|
||||
|
||||
// Need to see log INFO messages
|
||||
Level level = logger.getLevel();
|
||||
logger.setLevel(Level.INFO);
|
||||
|
||||
File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp");
|
||||
String logFileName = logFile.getAbsolutePath();
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "longAlleles.vcf --no_cmdline_in_header -log " + logFileName,
|
||||
1,
|
||||
Arrays.asList("136f88a5bd07a022a3404089359cb8ee"));
|
||||
executeTest("test left alignment with long alleles with an error", spec);
|
||||
|
||||
// Make sure the "reference allele too long" message is in the log
|
||||
Assert.assertTrue(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG));
|
||||
|
||||
// Set the log level back
|
||||
logger.setLevel(level);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentLongAllelesFix() throws IOException {
|
||||
|
||||
// Need to see log INFO messages
|
||||
Level level = logger.getLevel();
|
||||
logger.setLevel(Level.INFO);
|
||||
|
||||
File logFile = createTempFile("testLargeReferenceAlleleError.log", ".tmp");
|
||||
String logFileName = logFile.getAbsolutePath();
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir +
|
||||
"longAlleles.vcf --no_cmdline_in_header --reference_window_stop 208 -log " + logFileName,
|
||||
1,
|
||||
Arrays.asList("c4ca5520ee499da171053059e3717b2f"));
|
||||
executeTest("test left alignment with long alleles fix", spec);
|
||||
|
||||
// Make sure the "reference allele too long" message is in the log
|
||||
Assert.assertFalse(FileUtils.readFileToString(logFile).contains(ValidateVariants.REFERENCE_ALLELE_TOO_LONG_MSG));
|
||||
|
||||
// Set the log level back
|
||||
logger.setLevel(level);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLeftAlignmentDontTrim() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -94,17 +94,27 @@ import java.util.*;
|
|||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* -o output.vcf \
|
||||
* --dontTrimAlleles
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Left align and trim alleles, process alleles <= 208 bases</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* --reference_window_stop 208
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Split multiallics into biallelics, left align and trim alleles</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* -o output.vcf \
|
||||
* --splitMultiallelics
|
||||
* </pre>
|
||||
*
|
||||
|
|
@ -114,8 +124,8 @@ import java.util.*;
|
|||
* -T LeftAlignAndTrimVariants \
|
||||
* -R reference.fasta \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* --splitMultiallelics
|
||||
* -o output.vcf \
|
||||
* --splitMultiallelics \
|
||||
* --dontTrimAlleles
|
||||
* </pre>
|
||||
*
|
||||
|
|
@ -124,6 +134,9 @@ import java.util.*;
|
|||
@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well!
|
||||
public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
||||
|
||||
// Log message for a reference allele that is too long
|
||||
protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long";
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
|
|
@ -147,6 +160,10 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
private VariantContextWriter writer;
|
||||
|
||||
private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size!
|
||||
|
||||
// Stop of the expanded window for which the reference context should be provided, relative to the locus.
|
||||
private int referenceWindowStop;
|
||||
|
||||
public void initialize() {
|
||||
String trackName = variantCollection.variants.getName();
|
||||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
|
||||
|
|
@ -155,7 +172,9 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder();
|
||||
baseWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
|
||||
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200);
|
||||
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH);
|
||||
|
||||
referenceWindowStop = getToolkit().getArguments().reference_window_stop;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -203,6 +222,15 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
|
|||
@Requires("vc != null")
|
||||
protected int trimAlignWrite(final VariantContext vc, final ReferenceContext ref, final int numBiallelics ){
|
||||
|
||||
final int refLength = vc.getReference().length();
|
||||
|
||||
// ignore if the reference length is greater than the reference window stop before and after expansion
|
||||
if ( refLength > MAX_INDEL_LENGTH && refLength > referenceWindowStop ) {
|
||||
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d",
|
||||
REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength));
|
||||
return 0;
|
||||
}
|
||||
|
||||
// optionally don't trim VC
|
||||
final VariantContext v = dontTrimAlleles ? vc : GATKVariantContextUtils.trimAlleles(vc, true, true);
|
||||
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ import java.util.*;
|
|||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>To perform VCF format tests and all strict validations with the VCFs containing alleles > 208 bases</h4>
|
||||
* <h4>To perform VCF format tests and all strict validations with the VCFs containing alleles <= 208 bases</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T ValidateVariants \
|
||||
|
|
|
|||
Loading…
Reference in New Issue