Change default behavior of SelectVariants to trim remaining alleles when samples are subset. -noTrim argument preserves original alleles. Add test for trimming.

This commit is contained in:
Laura Gauthier 2014-11-07 10:28:53 -05:00
parent 31cb47b9e6
commit 783a4fd651
3 changed files with 34 additions and 5 deletions

View File

@ -378,4 +378,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
UserException.class);
executeTest("InvalidJexl", spec);
}
/**
 * Runs SelectVariants on forHardLeftAlignVariantsTest.vcf, keeping only sample NA12878 and
 * excluding non-variant loci, and compares the output against the expected MD5.
 *
 * No -noTrim flag is passed, so this exercises the default allele-handling path.
 *
 * NOTE(review): the original version built the spec but never executed it, so the MD5 below
 * was never actually checked by this test; confirm it matches once the test really runs.
 */
@Test
public void testAlleleTrimming() {
    final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
    // The input VCF is passed with -V, as the other SelectVariants tests do; the original
    // command appended the path as a bare token right after -env.
    final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env "
            + "-V " + testFile + " -o %s --no_cmdline_in_header";
    final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea"));
    // Without this call the spec is only constructed and the test passes vacuously.
    executeTest("testAlleleTrimming", spec);
}
}

View File

@ -100,7 +100,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
{ // AD and PL decoding race condition
final String testfile = privateTestDir + "race_condition.vcf";
final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt);
new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
}
}

View File

@ -104,7 +104,7 @@ import java.util.*;
* -se 'SAMPLE.+PARC'
* -select "QD > 10.0"
*
* Select a sample and exclude non-variant loci and filtered loci:
* Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
@ -114,6 +114,16 @@ import java.util.*;
* -env \
* -ef
*
* Select a sample, subset remaining alleles, but don't trim:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_ACTG \
* -env \
* -noTrim
*
* Select a sample and restrict the output vcf to a set of intervals:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@ -234,6 +244,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
protected boolean EXCLUDE_FILTERED = false;
/**
* By default, bases common to all remaining alleles are removed, leaving only their minimal representation.
* If this argument is set, the original alleles from the input VCF are preserved instead.
*/
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
protected boolean preserveAlleles = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as:
@ -509,7 +526,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( containsIndelLargerThan(vc, maxIndelSize) )
continue;
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles);
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
boolean failedJexlMatch = false;
@ -665,7 +682,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) {
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
return vc;
@ -702,7 +719,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
addAnnotations(builder, vc, sub.getSampleNames());
return builder.make();
final VariantContext subset = builder.make();
final VariantContext trimmed = preserveAlleles? subset : GATKVariantContextUtils.trimAlleles(subset,true,true);
return trimmed;
}
/*