Merge pull request #802 from broadinstitute/pd_selectvariants_subset
Added -trimAlternates argument to SelectVariants
This commit is contained in:
commit
e26e55efe1
|
|
@ -53,8 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
|
||||||
|
|
||||||
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
|
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
|
||||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class SelectVariantsIntegrationTest extends WalkerTest {
|
public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||||
|
|
@ -280,7 +282,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||||
String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
|
String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf";
|
||||||
|
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
"-T SelectVariants --keepOriginalAC -env -trimAlternates -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6")
|
Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6")
|
||||||
);
|
);
|
||||||
|
|
@ -319,7 +321,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||||
String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf";
|
String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf";
|
||||||
String samplesFile = privateTestDir + "GIH.samples.list";
|
String samplesFile = privateTestDir + "GIH.samples.list";
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
|
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants -trimAlternates --variant " + testfile,
|
||||||
1,
|
1,
|
||||||
Arrays.asList("69862fb97e8e895fe65c7abb14b03cee")
|
Arrays.asList("69862fb97e8e895fe65c7abb14b03cee")
|
||||||
);
|
);
|
||||||
|
|
@ -382,8 +384,35 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testAlleleTrimming() {
|
public void testAlleleTrimming() {
|
||||||
final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
|
final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf";
|
||||||
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12878 -env "
|
final String cmd = "-T SelectVariants -R " + b37KGReference + " -sn NA12878 -env -trimAlternates "
|
||||||
+ testFile + " -o %s --no_cmdline_in_header";
|
+ "-V " + testFile + " -o %s --no_cmdline_in_header";
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("69c3f59c132418ec10117aa395addfea"));
|
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df942000eb18b12d9008c7d9b5c4178"));
|
||||||
|
executeTest("testAlleleTrimming", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name="unusedAlleleTrimmingProvider")
|
||||||
|
public Object[][] unusedAlleleTrimmingProvider() {
|
||||||
|
return new Object[][] {
|
||||||
|
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "-trimAlternates", "9df942000eb18b12d9008c7d9b5c4178"},
|
||||||
|
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "981b757e3dc6bf3864ac7e493cf9d30d"},
|
||||||
|
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "8ded359dd87fd498ff38736ea0fa4c28"},
|
||||||
|
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "a7e7288dcd779cfac6983069de45b79c"},
|
||||||
|
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2e726d06a8d317199e8dda74691948a3"},
|
||||||
|
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "1e5585f86c347da271a79fbfc61ac849"}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "unusedAlleleTrimmingProvider")
|
||||||
|
public void testUnusedAlleleTrimming(final String vcf, final String extraArgs, final String md5) {
|
||||||
|
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T SelectVariants" +
|
||||||
|
" -R "+b37KGReference +
|
||||||
|
" -V "+vcf +
|
||||||
|
" -o %s --no_cmdline_in_header" +
|
||||||
|
" "+extraArgs,
|
||||||
|
1,
|
||||||
|
Arrays.asList(md5)
|
||||||
|
);
|
||||||
|
executeTest(String.format("testUnusedAlleleTrimming: (%s,%s)", new File(vcf).getName(), extraArgs), spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest {
|
||||||
}
|
}
|
||||||
{ // AD and PL decoding race condition
|
{ // AD and PL decoding race condition
|
||||||
final String testfile = privateTestDir + "race_condition.vcf";
|
final String testfile = privateTestDir + "race_condition.vcf";
|
||||||
final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
|
final String args = "-env -trimAlternates -sn SAMPLE -L 1:1-10,000,000 -V " + testfile;
|
||||||
new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
|
new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -253,6 +253,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
|
@Argument(fullName="preserveAlleles", shortName="noTrim", doc="Preserve original alleles, do not trim", required=false)
|
||||||
protected boolean preserveAlleles = false;
|
protected boolean preserveAlleles = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When this argument is present, all alternate alleles that are not present in the (output) samples will be removed.
|
||||||
|
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
|
||||||
|
* removed and the record will contain a '.' in the ALT column. Also note that sites-only VCFs, by definition, do
|
||||||
|
* not include the alternate allele in any genotype calls.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="removeUnusedAlternates", shortName="trimAlternates", doc="Remove alternate alleles not present in any genotypes", required=false)
|
||||||
|
protected boolean removeUnusedAlternates = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
|
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
|
||||||
* For example, a multiallelic record such as:
|
* For example, a multiallelic record such as:
|
||||||
|
|
@ -528,7 +537,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
if ( containsIndelLargerThan(vc, maxIndelSize) )
|
if ( containsIndelLargerThan(vc, maxIndelSize) )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS, preserveAlleles);
|
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
|
||||||
|
|
||||||
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
||||||
boolean failedJexlMatch = false;
|
boolean failedJexlMatch = false;
|
||||||
|
|
@ -681,25 +690,30 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
|
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
|
||||||
*
|
*
|
||||||
* @param vc the VariantContext record to subset
|
* @param vc the VariantContext record to subset
|
||||||
* @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
|
* @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it?
|
||||||
|
* @param removeUnusedAlternates removes alternate alleles with AC=0
|
||||||
* @return the subsetted VariantContext
|
* @return the subsetted VariantContext
|
||||||
*/
|
*/
|
||||||
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants, final boolean preserveAlleles) {
|
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
|
||||||
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
|
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
|
||||||
|
if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates )
|
||||||
return vc;
|
return vc;
|
||||||
|
|
||||||
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
|
// strip out the alternate alleles that aren't being used
|
||||||
|
final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates);
|
||||||
|
|
||||||
|
//If no subsetting happened, exit now
|
||||||
|
if ( sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles() )
|
||||||
|
return vc;
|
||||||
|
|
||||||
final VariantContextBuilder builder = new VariantContextBuilder(sub);
|
final VariantContextBuilder builder = new VariantContextBuilder(sub);
|
||||||
|
|
||||||
// if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
|
// if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
|
||||||
GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);
|
GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);
|
||||||
|
|
||||||
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
|
// since the VC has been subset (either by sample or allele), we need to strip out the MLE tags
|
||||||
if ( vc.getNSamples() != sub.getNSamples() ) {
|
|
||||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
|
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
|
||||||
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
||||||
}
|
|
||||||
|
|
||||||
// Remove a fraction of the genotypes if needed
|
// Remove a fraction of the genotypes if needed
|
||||||
if ( fractionGenotypes > 0 ){
|
if ( fractionGenotypes > 0 ){
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue