Added -trimAlternates argument to SelectVariants
* PT 84021222 * -trimAlternates removes all unused alternate alleles from variants. Note that this is pretty aggressive for monomorphic sites
This commit is contained in:
parent
d209699485
commit
72f76add71
|
|
@ -53,8 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
|
|||
|
||||
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||
|
|
@ -280,7 +282,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
"-T SelectVariants --keepOriginalAC -env -trimAlternates -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6")
|
||||
);
|
||||
|
|
@ -319,7 +321,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf";
|
||||
String samplesFile = privateTestDir + "GIH.samples.list";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
|
||||
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants -trimAlternates --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList("69862fb97e8e895fe65c7abb14b03cee")
|
||||
);
|
||||
|
|
@ -382,8 +384,35 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testAlleleTrimming() {
|
||||
final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
|
||||
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env "
|
||||
+ testFile + " -o %s --no_cmdline_in_header";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea"));
|
||||
final String cmd = "-T SelectVariants -R " + b37KGReference + " -sn NA12878 -env -trimAlternates "
|
||||
+ "-V " + testFile + " -o %s --no_cmdline_in_header";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df942000eb18b12d9008c7d9b5c4178"));
|
||||
executeTest("testAlleleTrimming", spec);
|
||||
}
|
||||
|
||||
@DataProvider(name="unusedAlleleTrimmingProvider")
|
||||
public Object[][] unusedAlleleTrimmingProvider() {
|
||||
return new Object[][] {
|
||||
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "-trimAlternates", "9df942000eb18b12d9008c7d9b5c4178"},
|
||||
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "981b757e3dc6bf3864ac7e493cf9d30d"},
|
||||
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "8ded359dd87fd498ff38736ea0fa4c28"},
|
||||
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "a7e7288dcd779cfac6983069de45b79c"},
|
||||
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2e726d06a8d317199e8dda74691948a3"},
|
||||
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "1e5585f86c347da271a79fbfc61ac849"}
|
||||
};
|
||||
}
|
||||
|
||||
@Test(dataProvider = "unusedAlleleTrimmingProvider")
|
||||
public void testUnusedAlleleTrimming(final String vcf, final String extraArgs, final String md5) {
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants" +
|
||||
" -R "+b37KGReference +
|
||||
" -V "+vcf +
|
||||
" -o %s --no_cmdline_in_header" +
|
||||
" "+extraArgs,
|
||||
1,
|
||||
Arrays.asList(md5)
|
||||
);
|
||||
executeTest(String.format("testUnusedAlleleTrimming: (%s,%s)", new File(vcf).getName(), extraArgs), spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
|
|||
}
|
||||
{ // AD and PL decoding race condition
|
||||
final String testfile = privateTestDir + "race_condition.vcf";
|
||||
final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
|
||||
final String args = "-env -trimAlternates -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
|
||||
new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -253,6 +253,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
|
||||
protected boolean preserveAlleles = false;
|
||||
|
||||
/**
|
||||
* When this argument is present, all alternate alleles that are not present in the (output) samples will be removed.
|
||||
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
|
||||
* removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do
|
||||
* not include the alternate allele in any genotype calls.
|
||||
*/
|
||||
@Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false)
|
||||
protected boolean removeUnusedAlternates = false;
|
||||
|
||||
/**
|
||||
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
|
||||
* For example, a multiallelic record such as:
|
||||
|
|
@ -528,7 +537,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if ( containsIndelLargerThan(vc, maxIndelSize) )
|
||||
continue;
|
||||
|
||||
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles);
|
||||
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
|
||||
|
||||
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
||||
boolean failedJexlMatch = false;
|
||||
|
|
@ -681,25 +690,30 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
|
||||
*
|
||||
* @param vc the VariantContext record to subset
|
||||
* @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
|
||||
* @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it?
|
||||
* @param removeUnusedAlternates removes alternate alleles with AC=0
|
||||
* @return the subsetted VariantContext
|
||||
*/
|
||||
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) {
|
||||
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
|
||||
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
|
||||
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
|
||||
if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates )
|
||||
return vc;
|
||||
|
||||
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
|
||||
// strip out the alternate alleles that aren't being used
|
||||
final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates);
|
||||
|
||||
//If no subsetting happened, exit now
|
||||
if ( sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles() )
|
||||
return vc;
|
||||
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(sub);
|
||||
|
||||
// if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
|
||||
GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);
|
||||
|
||||
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
|
||||
if ( vc.getNSamples() != sub.getNSamples() ) {
|
||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
|
||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
||||
}
|
||||
// since the VC has been subset (either by sample or allele), we need to strip out the MLE tags
|
||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
|
||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
||||
|
||||
// Remove a fraction of the genotypes if needed
|
||||
if ( fractionGenotypes > 0 ){
|
||||
|
|
|
|||
Loading…
Reference in New Issue