Merge pull request #802 from broadinstitute/pd_selectvariants_subset

Added -trimAlternates argument to SelectVariants
This commit is contained in:
Valentin Ruano Rubio 2015-01-22 05:05:42 -05:00
commit e26e55efe1
3 changed files with 59 additions and 16 deletions

View File

@ -53,8 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
public class SelectVariantsIntegrationTest extends WalkerTest {
@ -280,7 +282,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants --keepOriginalAC -env -trimAlternates -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6")
);
@ -319,7 +321,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf";
String samplesFile = privateTestDir + "GIH.samples.list";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants -trimAlternates --variant " + testfile,
1,
Arrays.asList("69862fb97e8e895fe65c7abb14b03cee")
);
@ -382,8 +384,35 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
@Test
public void testAlleleTrimming() {
final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env "
+ testFile + " -o %s --no_cmdline_in_header";
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea"));
final String cmd = "-T SelectVariants -R " + b37KGReference + " -sn NA12878 -env -trimAlternates "
+ "-V " + testFile + " -o %s --no_cmdline_in_header";
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df942000eb18b12d9008c7d9b5c4178"));
executeTest("testAlleleTrimming", spec);
}
@DataProvider(name="unusedAlleleTrimmingProvider")
public Object[][] unusedAlleleTrimmingProvider() {
return new Object[][] {
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "-trimAlternates", "9df942000eb18b12d9008c7d9b5c4178"},
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "981b757e3dc6bf3864ac7e493cf9d30d"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "8ded359dd87fd498ff38736ea0fa4c28"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "a7e7288dcd779cfac6983069de45b79c"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2e726d06a8d317199e8dda74691948a3"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "1e5585f86c347da271a79fbfc61ac849"}
};
}
@Test(dataProvider = "unusedAlleleTrimmingProvider")
public void testUnusedAlleleTrimming(final String vcf, final String extraArgs, final String md5) {
final WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants" +
" -R "+b37KGReference +
" -V "+vcf +
" -o %s --no_cmdline_in_header" +
" "+extraArgs,
1,
Arrays.asList(md5)
);
executeTest(String.format("testUnusedAlleleTrimming: (%s,%s)", new File(vcf).getName(), extraArgs), spec);
}
}

View File

@ -99,7 +99,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
}
{ // AD and PL decoding race condition
final String testfile = privateTestDir + "race_condition.vcf";
final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
final String args = "-env -trimAlternates -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
}
}

View File

@ -253,6 +253,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
protected boolean preserveAlleles = false;
/**
* When this argument is present, all alternate alleles that are not present in the (output) samples will be removed.
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
* removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do
* not include the alternate allele in any genotype calls.
*/
@Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false)
protected boolean removeUnusedAlternates = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as:
@ -528,7 +537,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( containsIndelLargerThan(vc, maxIndelSize) )
continue;
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles);
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
boolean failedJexlMatch = false;
@ -681,25 +690,30 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
*
* @param vc the VariantContext record to subset
* @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
* @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it?
* @param removeUnusedAlternates removes alternate alleles with AC=0
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) {
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates )
return vc;
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
// strip out the alternate alleles that aren't being used
final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates);
//If no subsetting happened, exit now
if ( sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles() )
return vc;
final VariantContextBuilder builder = new VariantContextBuilder(sub);
// if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
if ( vc.getNSamples() != sub.getNSamples() ) {
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
}
// since the VC has been subset (either by sample or allele), we need to strip out the MLE tags
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
// Remove a fraction of the genotypes if needed
if ( fractionGenotypes > 0 ){