Change default behavior of SelectVariants to trim remaining alleles when samples are subset. -noTrim argument preserves original alleles. Add test for trimming.

This commit is contained in:
Laura Gauthier 2014-11-07 10:28:53 -05:00
parent 31cb47b9e6
commit 783a4fd651
3 changed files with 34 additions and 5 deletions

View File

@ -378,4 +378,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
UserException.class); UserException.class);
executeTest("InvalidJexl", spec); executeTest("InvalidJexl", spec);
} }
/**
 * Runs SelectVariants on a single sample (NA12878) with non-variant loci excluded (-env) and
 * checks the output against the expected MD5, exercising the default allele trimming.
 *
 * NOTE(review): the expected MD5 below had not been exercised by an executed spec before;
 * confirm it on the first run and update it if the output differs.
 */
@Test
public void testAlleleTrimming() {
    final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
    // The input VCF must be bound to the variant argument (-V); a bare path is not an argument.
    final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env -V "
            + testFile + " -o %s --no_cmdline_in_header";
    final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea"));
    // Actually run the walker; without this call the test passes without checking anything.
    executeTest("testAlleleTrimming", spec);
}
} }

View File

@ -100,7 +100,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
{ // AD and PL decoding race condition { // AD and PL decoding race condition
final String testfile = privateTestDir + "race_condition.vcf"; final String testfile = privateTestDir + "race_condition.vcf";
final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt); new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
} }
} }

View File

@ -104,7 +104,7 @@ import java.util.*;
* -se 'SAMPLE.+PARC' * -se 'SAMPLE.+PARC'
* -select "QD > 10.0" * -select "QD > 10.0"
* *
* Select a sample and exclude non-variant loci and filtered loci: * Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):
* java -Xmx2g -jar GenomeAnalysisTK.jar \ * java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \ * -R ref.fasta \
* -T SelectVariants \ * -T SelectVariants \
@ -114,6 +114,16 @@ import java.util.*;
* -env \ * -env \
* -ef * -ef
* *
* Select a sample, subset remaining alleles, but don't trim:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_ACTG \
* -env \
* -noTrim
*
* Select a sample and restrict the output vcf to a set of intervals: * Select a sample and restrict the output vcf to a set of intervals:
* java -Xmx2g -jar GenomeAnalysisTK.jar \ * java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \ * -R ref.fasta \
@ -234,6 +244,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
protected boolean EXCLUDE_FILTERED = false; protected boolean EXCLUDE_FILTERED = false;
/**
 * By default, after samples are subset, bases common to all remaining alleles are removed so that
 * each allele is left in its minimal representation.
 * If this argument is set, the original alleles from the input VCF are preserved instead.
 * This flag only matters when samples are being selected: if no samples are specified, the
 * record is returned unchanged and no trimming is attempted.
 */
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
protected boolean preserveAlleles = false;
/** /**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as: * For example, a multiallelic record such as:
@ -509,7 +526,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( containsIndelLargerThan(vc, maxIndelSize) ) if ( containsIndelLargerThan(vc, maxIndelSize) )
continue; continue;
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles);
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
boolean failedJexlMatch = false; boolean failedJexlMatch = false;
@ -665,7 +682,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles? * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
* @return the subsetted VariantContext * @return the subsetted VariantContext
*/ */
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) { private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) {
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() ) if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
return vc; return vc;
@ -702,7 +719,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
addAnnotations(builder, vc, sub.getSampleNames()); addAnnotations(builder, vc, sub.getSampleNames());
return builder.make(); final VariantContext subset = builder.make();
final VariantContext trimmed = preserveAlleles? subset : GATKVariantContextUtils.trimAlleles(subset,true,true);
return trimmed;
} }
/* /*