Merge pull request #1264 from broadinstitute/rhl_max_nocall_1262

Backport maxNoCall functionality from GATK4
This commit is contained in:
Ron Levine 2016-01-06 14:54:24 -05:00
commit f002acebdf
2 changed files with 108 additions and 4 deletions

View File

@ -69,6 +69,8 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
private static final String INVERT_SELECTION_MD5 = "26d192b868746ab14133f145ae812e7c";
private static final String MAX_FILTERED_GT_SELECTION_MD5 = "f83ac0deb7a8b022d6d40a85627a71ec";
private static final String MIN_FILTERED_GT_SELECTION_MD5 = "346620b7a5d66dabf89d3f42d6e27db7";
private static final String NO_CALL_FILTERING_KEEP_ONE = "6e2401190c5ada6a3bed2640c068f43b";
private static final String NO_CALL_FILTERING_KEEP_TWO = "6bced1ab6a3d58f1fd905b7f601987a3";
@Test
public void testDiscordanceNoSampleSpecified() {
@ -771,4 +773,56 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
spec.disableShadowBCF();
executeTest("testSACNonDiploid", spec);
}
@Test
public void testMaxNoCall1() {
final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLnumber 1",
1,
Arrays.asList(NO_CALL_FILTERING_KEEP_ONE));
spec.disableShadowBCF();
executeTest("testMaxNoCall1", spec);
}
@Test
public void testMaxNoCall0_25() {
final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLfraction 0.25",
1,
Arrays.asList(NO_CALL_FILTERING_KEEP_ONE));
spec.disableShadowBCF();
executeTest("testMaxNoCall0_25", spec);
}
@Test
public void testMaxNoCall2() {
final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLnumber 2",
1,
Arrays.asList(NO_CALL_FILTERING_KEEP_TWO));
spec.disableShadowBCF();
executeTest("testMaxNoCall2", spec);
}
@Test
public void testMaxNoCall0_5() {
final String testfile = privateTestDir + "vcfexample.forNoCallFiltering.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " --variant " + testfile + " -o %s --no_cmdline_in_header --maxNOCALLfraction 0.5",
1,
Arrays.asList(NO_CALL_FILTERING_KEEP_TWO));
spec.disableShadowBCF();
executeTest("testMaxNoCall0_5", spec);
}
}

View File

@ -309,9 +309,11 @@ import java.util.*;
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE;
static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0;
static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0;
static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0;
static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0;
private static final int MAX_NOCALL_NUMBER_DEFAULT_VALUE = Integer.MAX_VALUE;
private static final double MAX_NOCALL_FRACTION_DEFAULT_VALUE = 1.0;
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -564,6 +566,18 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level")
private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at most the given number of samples have no-call genotypes.
*/
@Argument(fullName="maxNOCALLnumber", required=false, doc="Maximum number of samples with no-call genotypes")
private int maxNOCALLnumber = MAX_NOCALL_NUMBER_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at most the given fraction of samples have no-call genotypes.
*/
@Argument(fullName="maxNOCALLfraction", required=false, doc="Maximum fraction of samples with no-call genotypes")
private double maxNOCALLfraction = MAX_NOCALL_FRACTION_DEFAULT_VALUE;
/**
* If this argument is provided, set filtered genotypes to no-call (./.).
*/
@ -815,7 +829,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize))
continue;
if ( needNumFilteredGenotypes()) {
if ( considerFilteredGenotypes()) {
int numFilteredSamples = numFilteredGenotypes(vc);
double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size();
if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes ||
@ -823,6 +837,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
continue;
}
if (considerNoCallGenotypes()) {
final int numNoCallSamples = numNoCallGenotypes(vc);
final double fractionNoCallGenotypes = samples.isEmpty() ? 0.0 : ((double) numNoCallSamples) / samples.size();
if (numNoCallSamples > maxNOCALLnumber || fractionNoCallGenotypes > maxNOCALLfraction)
continue;
}
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall);
@ -877,6 +898,25 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return false;
}
/**
* Find the number of no-call genotypes
*
* @param vc the variant context
* @return number of no-call genotypes
*/
private int numNoCallGenotypes(final VariantContext vc) {
if (vc == null)
return 0;
int numFiltered = 0;
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isNoCall() )
numFiltered++;
}
return numFiltered;
}
/**
* Find the number of filtered samples
*
@ -1183,14 +1223,24 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
/**
* Need the number of filtered genotypes samples?
* Should the number of filtered genotypes be considered for filtering?
*
* @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise
*/
private boolean needNumFilteredGenotypes(){
private boolean considerFilteredGenotypes(){
return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE ||
minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE ||
maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE ||
minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
}
/**
* Should the number of no-call genotypes be considered for filtering?
*
* @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise
*
*/
private boolean considerNoCallGenotypes(){
return maxNOCALLnumber != MAX_NOCALL_NUMBER_DEFAULT_VALUE || maxNOCALLfraction != MAX_NOCALL_FRACTION_DEFAULT_VALUE;
}
}