Merge pull request #947 from broadinstitute/rhl_invert_selection

Added --invert_selection flag for variant selection queries
This commit is contained in:
Geraldine Van der Auwera 2015-05-13 13:40:32 -04:00
commit f6b3d8e862
7 changed files with 657 additions and 163 deletions

View File

@ -144,6 +144,14 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
executeTest("test filter with separate names #2", spec);
}
@Test
public void testInvertFilter() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1,
Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765"));
executeTest("test inversion of selection of filter with separate names #2", spec);
}
@Test
public void testGenotypeFilters1() {
WalkerTestSpec spec1 = new WalkerTestSpec(
@ -194,4 +202,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
Arrays.asList("e10485c7c33d9211d0c1294fd7858476"));
executeTest("testFilteringDPfromFORMAT", spec);
}
@Test
public void testInvertGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1,
Arrays.asList("d2664870e7145eb73a2295766482c823"));
executeTest("testInvertGenotypeFilterExpression", spec);
}
}

View File

@ -64,6 +64,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args;
}
private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537";
private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b";
@Test
public void testDiscordanceNoSampleSpecified() {
String testFile = privateTestDir + "NA12878.hg19.example1.vcf";
@ -150,6 +153,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testNonExistingSelection--" + testfile, spec);
}
/**
* Test excluding samples from file and sample name
*/
@Test
public void testSampleExclusionFromFileAndSeparateSample() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
@ -162,9 +168,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
);
spec.disableShadowBCF();
executeTest("testSampleExclusion--" + testfile, spec);
executeTest("testSampleExclusionFromFileAndSeparateSample--" + testfile, spec);
}
/**
* Test excluding samples from file
*/
@Test
public void testSampleExclusionJustFromFile() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
@ -177,9 +186,46 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
);
spec.disableShadowBCF();
executeTest("testSampleExclusion--" + testfile, spec);
executeTest("testSampleExclusionJustFromFile--" + testfile, spec);
}
/**
* Test excluding samples from expression
*/
@Test
public void testSampleExclusionJustFromExpression() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile,
1,
Arrays.asList(sampleExclusionMD5)
);
spec.disableShadowBCF();
executeTest("testSampleExclusionJustFromExpression--" + testfile, spec);
}
/**
* Test excluding samples from negation expression
*/
@Test
public void testSampleExclusionJustFromNegationExpression() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile,
1,
Arrays.asList(sampleExclusionMD5)
);
spec.disableShadowBCF();
executeTest("testSampleExclusionJustFromRegexExpression--" + testfile, spec);
}
/**
* Test including samples that are not in the VCF
*/
@Test
public void testSampleInclusionWithNonexistingSamples() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
@ -195,7 +241,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec);
}
@Test
public void testConcordance() {
String testFile = privateTestDir + "NA12878.hg19.example1.vcf";
@ -212,6 +257,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testConcordance--" + testFile, spec);
}
/**
* Test including variant types.
*/
@Test
public void testVariantTypeSelection() {
String testFile = privateTestDir + "complexExample1.vcf";
@ -219,23 +267,42 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794")
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
}
/**
* Test excluding indels that are larger than the specified size
*/
@Test
public void testIndelLengthSelection() {
public void testMaxIndelLengthSelection() {
String testFile = privateTestDir + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3",
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 2",
1,
Arrays.asList("004589868ca5dc887e2dff876b4cc797")
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
);
executeTest("testIndelLengthSelection--" + testFile, spec);
executeTest("testMaxIndelLengthSelection--" + testFile, spec);
}
/**
* Test excluding indels that are smaller than the specified size
*/
@Test
public void testMinIndelLengthSelection() {
String testFile = privateTestDir + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2",
1,
Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb")
);
executeTest("testMinIndelLengthSelection--" + testFile, spec);
}
@Test
@ -488,4 +555,114 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testMissingVcfException--" + testfile, spec);
}
/**
* Test inverting the variant selection criteria
*/
@Test
public void testInvertSelection() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile),
1,
Arrays.asList(invertSelectionMD5)
);
spec.disableShadowBCF();
executeTest("testInvertSelection--" + testfile, spec);
}
/**
* Test inverting the variant JEXL selection criteria
*/
@Test
public void testInvertJexlSelection() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile),
1,
Arrays.asList(invertSelectionMD5)
);
spec.disableShadowBCF();
executeTest("testInvertJexlSelection--" + testfile, spec);
}
/**
* Test selecting variants with IDs
*/
@Test
public void testKeepSelectionID() {
String testFile = privateTestDir + "complexExample1.vcf";
String idFile = privateTestDir + "complexExample1.vcf.id";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -IDs " + idFile + " --variant " + testFile),
1,
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
);
spec.disableShadowBCF();
executeTest("testKeepSelectionID--" + testFile, spec);
}
/**
* Test excluding variants with IDs
*/
@Test
public void testExcludeSelectionID() {
String testFile = privateTestDir + "complexExample1.vcf";
String idFile = privateTestDir + "complexExample1.vcf.id";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -xlIDs " + idFile + " --variant " + testFile),
1,
Arrays.asList("77514a81233e1bbc0f5e47b0fb76a89a")
);
spec.disableShadowBCF();
executeTest("testExcludeSelectionID--" + testFile, spec);
}
/**
* Test excluding variant types
*/
@Test
public void testExcludeSelectionType() {
String testFile = privateTestDir + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb")
);
executeTest("testExcludeSelectionType--" + testFile, spec);
}
@Test
public void testMendelianViolationSelection() {
String testFile = privateTestDir + "CEUtrioTest.vcf";
String pedFile = privateTestDir + "CEUtrio.ped";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("406243096074a417d2aa103bd3d13e01"));
executeTest("testMendelianViolationSelection--" + testFile, spec);
}
@Test
public void testInvertMendelianViolationSelection() {
String testFile = privateTestDir + "CEUtrioTest.vcf";
String pedFile = privateTestDir + "CEUtrio.ped";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("35921fb2dedca0ead83027a66b725794"));
executeTest("testMendelianViolationSelection--" + testFile, spec);
}
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.tools.walkers.filters;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
@ -104,13 +105,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
*/
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList<String> FILTER_EXPS = new ArrayList<String>();
protected ArrayList<String> filterExpressions = new ArrayList<String>();
/**
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
*/
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList<String> FILTER_NAMES = new ArrayList<String>();
protected ArrayList<String> filterNames = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
@ -120,13 +121,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
* expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object.
*/
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
protected ArrayList<String> GENOTYPE_FILTER_EXPS = new ArrayList<String>();
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
*/
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList<String> GENOTYPE_FILTER_NAMES = new ArrayList<String>();
protected ArrayList<String> genotypeFilterNames = new ArrayList<String>();
/**
* Works together with the --clusterWindowSize argument.
@ -141,7 +142,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
protected Integer clusterWindow = 0;
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
protected Integer MASK_EXTEND = 0;
protected Integer maskExtension = 0;
/**
* When using the -mask argument, the maskName will be annotated in the variant record.
@ -150,7 +151,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
*/
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String MASK_NAME = "Mask";
protected String maskName = "Mask";
/**
* By default, if the -mask argument is used, any variant falling in a mask will be filtered.
@ -167,7 +168,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
* Use this argument to have it evaluate as failing filters instead for these cases.
*/
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
protected Boolean failMissingValues = false;
/**
* Invalidate previous filters applied to the VariantContext, applying only the filters here
@ -175,6 +176,18 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
boolean invalidatePrevious = false;
/**
* Invert the selection criteria for --filterExpression
*/
@Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false)
protected boolean invertFilterExpression = false;
/**
* Invert the selection criteria for --genotypeFilterExpression
*/
@Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
protected boolean invertGenotypeFilterExpression = false;
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
@ -185,9 +198,21 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
// the structures necessary to initialize and maintain a windowed context
private FiltrationContextWindow variantContextWindow;
private static final int windowSize = 10; // 10 variants on either end of the current one
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
/**
* Prepend inverse phrase to description if --invert_filter_expression
*
* @param description the description
* @return the description with inverse prepended if --invert_filter_expression
*/
private String possiblyInvertFilterExpression( String description ){
if ( invertFilterExpression )
description = "Inverse of: " + description;
return description;
}
private void initializeVcfWriter() {
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
@ -199,19 +224,19 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
if ( clusterWindow > 0 )
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
if ( genotypeFilterExps.size() > 0 )
if ( !genotypeFilterExps.isEmpty() )
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
try {
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
if ( mask.isBound() ) {
if (filterRecordsNotInMask)
hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask"));
else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
}
} catch (IllegalArgumentException e) {
throw new UserException.BadInput(e.getMessage());
@ -224,13 +249,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
if ( clusterWindow > 0 )
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow);
if ( MASK_EXTEND < 0 )
if ( maskExtension < 0 )
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
if (filterRecordsNotInMask && !mask.isBound())
throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS);
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
VariantContextUtils.engine.get().setSilent(true);
@ -262,15 +287,9 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
if ( invalidatePrevious ) {
vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
}
// filter based on previous mask position
if ( previousMaskPosition != null && // we saw a previous mask site
previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig
vc.getStart() - previousMaskPosition.getStop() <= MASK_EXTEND && // it's within the mask area (multi-base masks that overlap this site will always give a negative distance)
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
filters.add(MASK_NAME);
vc = new VariantContextBuilder(vc).filters(filters).make();
}
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
FiltrationContext varContext = new FiltrationContext(ref, vc);
@ -280,11 +299,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
// if this is a mask position, filter previous records
if ( hasMask ) {
for ( FiltrationContext prevVC : windowInitializer )
prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus()));
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
}
windowInitializer.add(varContext);
if ( windowInitializer.size() == windowSize ) {
if ( windowInitializer.size() == WINDOW_SIZE ) {
variantContextWindow = new FiltrationContextWindow(windowInitializer);
windowInitializer = null;
}
@ -294,7 +313,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
if ( hasMask ) {
for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
if ( prevVC != null )
prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus()));
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
}
}
@ -306,12 +325,44 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
return 1;
}
private VariantContext checkMaskForPreviousLocation(VariantContext vc, GenomeLoc maskLoc) {
if ( maskLoc.getContig().equals(vc.getChr()) && // it's on the same contig
maskLoc.getStart() - vc.getEnd() <= MASK_EXTEND && // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
/**
* Helper function to check if a mask covers the variant location.
*
* @param vc variant context
* @param genomeLoc genome location
* @param maskName name of the mask
* @param maskExtension bases beyond the mask
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
* @return true if the genome location is within the extended mask area, false otherwise
*/
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
boolean logic = genomeLoc != null && // have a location
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
if ( logic ) {
if (vcBeforeLoc)
return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
else
return vc.getStart() - genomeLoc.getStop() <= maskExtension;
} else {
return false;
}
}
/**
* Add mask to variant context filters if it covers the it's location
*
* @param vc VariantContext
* @param genomeLoc genome location
* @param maskName name of the mask
* @param maskExtension bases beyond the mask
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
*/
private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) {
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
filters.add(MASK_NAME);
filters.add(maskName);
vc = new VariantContextBuilder(vc).filters(filters).make();
}
@ -328,7 +379,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
final VariantContextBuilder builder = new VariantContextBuilder(vc);
// make new Genotypes based on filters
if ( genotypeFilterExps.size() > 0 ) {
if ( !genotypeFilterExps.isEmpty() ) {
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
// for each genotype, check filters then create a new object
@ -338,7 +389,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
if ( g.isFiltered() ) filters.add(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
filters.add(exp.name);
}
@ -360,11 +411,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
try {
if ( VariantContextUtils.match(vc, exp) )
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) )
filters.add(exp.name);
} catch (Exception e) {
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
if ( FAIL_MISSING_VALUES )
if ( failMissingValues )
filters.add(exp.name);
}
}
@ -389,11 +440,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
public void onTraversalDone(Integer result) {
// move the window over so that we can filter the last few variants
if ( windowInitializer != null ) {
while ( windowInitializer.size() < windowSize )
while ( windowInitializer.size() < WINDOW_SIZE )
windowInitializer.add(null);
variantContextWindow = new FiltrationContextWindow(windowInitializer);
}
for (int i=0; i < windowSize; i++) {
for (int i=0; i < WINDOW_SIZE; i++) {
variantContextWindow.moveWindow(null);
filter();
}

View File

@ -113,9 +113,22 @@ import java.util.*;
* -se 'SAMPLE.+PARC'
* </pre>
*
* <h4>Select any sample that matches a regular expression and sites where the QD annotation is more than 10</h4>
* <h4>Exclude two samples and any sample that matches a regular expression:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -xl_sn SAMPLE_1_PARC \
* -xl_sn SAMPLE_1_ACTG \
* -xl_se 'SAMPLE.+PARC'
* </pre>
*
* <h4>Select any sample that matches a regular expression and sites where the QD annotation is more than 10:</h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
@ -124,9 +137,22 @@ import java.util.*;
* -select "QD > 10.0"
* </pre>
*
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default)</h4>
* <h4>Select any sample that does not match a regular expression and sites where the QD annotation is more than 10:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -se 'SAMPLE.+PARC' \
* -select "QD > 10.0"
* --invert_selection
* </pre>
*
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
@ -136,7 +162,7 @@ import java.util.*;
* -ef
* </pre>
*
* <h4>Select a sample, subset remaining alleles, but don't trim</h4>
* <h4>Select a sample, subset remaining alleles, but don't trim:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -148,7 +174,7 @@ import java.util.*;
* -noTrim
*</pre>
*
* <h4>Select a sample and restrict the output vcf to a set of intervals</h4>
* <h4>Select a sample and restrict the output vcf to a set of intervals:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -159,7 +185,7 @@ import java.util.*;
* -sn SAMPLE_1_ACTG
* </pre>
*
* <h4>Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset)</h4>
* <h4>Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset):</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -170,7 +196,7 @@ import java.util.*;
* -sn mySample
* </pre>
*
* <h4>Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers)</h4>
* <h4>Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers):</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -192,7 +218,18 @@ import java.util.*;
* -o violations.vcf
* </pre>
*
* <h4>Create a set with 50% of the total number of variants in the variant VCF</h4>
* <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
* -ped family.ped \
* -mv -mvq 50 -inv_mv \
* -o violations.vcf
* </pre>
*
* <h4>Create a set with 50% of the total number of variants in the variant VCF:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -202,17 +239,30 @@ import java.util.*;
* -fraction 0.5
* </pre>
*
* <h4>Select only indels from a VCF</h4>
* <h4>Select only indels between 2 and 5 bases long from a VCF:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
* -o output.vcf \
* -selectType INDEL
* -minIndelSize 2
* -maxIndelSize 5
* </pre>
*
* <h4>Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column)</h4>
* <h4>Exclude indels from a VCF:</h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectTypeToExclude INDEL
* </pre>
*
* <h4>Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -223,6 +273,17 @@ import java.util.*;
* -restrictAllelesTo MULTIALLELIC
* </pre>
*
* <h4>Select IDs in fileKeep and exclude IDs in fileExclude:</h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -IDs fileKeep \
* -excludeIDs fileExclude
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@ -258,7 +319,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* argument can be specified multiple times in order to use multiple different matching patterns.
*/
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select multiple samples", required=false)
public Set<String> sampleExpressions ;
public Set<String> sampleExpressions;
/**
* Sample names should be in a plain text file listing one sample name per line. This argument can be specified multiple times in order to provide
@ -282,26 +343,40 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Input(fullName="exclude_sample_file", shortName="xl_sf", doc="List of samples to exclude", required=false)
public Set<File> XLsampleFiles = new HashSet<>(0);
/**
* Using a regular expression allows you to match multiple sample names that have that pattern in common. Note that sample exclusion takes precedence
* over inclusion, so that if a sample is in both lists it will be excluded. This argument can be specified multiple times in order to use multiple
* different matching patterns.
*/
@Input(fullName="exclude_sample_expressions", shortName="xl_se", doc="List of sample expressions to exclude", required=false)
public Set<String> XLsampleExpressions = new HashSet<>(0);
/**
* See example commands above for detailed usage examples. Note that these expressions are evaluated *after* the
* specified samples are extracted and the INFO field annotations are updated.
*/
@Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<>();
public ArrayList<String> selectExpressions = new ArrayList<>();
/**
* Invert the selection criteria for --select.
*/
@Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false)
protected boolean invertSelection = false;
/*
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
* of the selected samples display evidence of variation) will be excluded from the output.
*/
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include non-variant sites", required=false)
protected boolean EXCLUDE_NON_VARIANTS = false;
protected boolean XLnonVariants = false;
/**
* If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS`
* in the FILTER field) will be excluded from the output.
*/
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered sites", required=false)
protected boolean EXCLUDE_FILTERED = false;
protected boolean XLfiltered = false;
/**
* The default behavior of this tool is to remove bases common to all remaining alleles after subsetting
@ -338,7 +413,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* AC_Orig, AF_Orig, and AN_Orig.
*/
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
private boolean keepOriginalChrCounts = false;
/**
* When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the contents of the
@ -346,7 +421,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* DP_Orig.
*/
@Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value after subsetting", required=false)
private boolean KEEP_ORIGINAL_DEPTH = false;
private boolean keepOriginalDepth = false;
/**
* If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as
@ -354,14 +429,22 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* `-ped` argument.
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="Output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
private Boolean mendelianViolations = false;
/**
* If this flag is enabled, this tool will select only variants that do not correspond to a mendelian violation as
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
@Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false)
private Boolean invert_mendelianViolations = false;
/**
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
* for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument to have an effect.
*/
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum GQ score for each trio member to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
protected double medelianViolationQualThreshold = 0;
/**
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
@ -385,7 +468,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<>();
private List<VariantContext.Type> typesToInclude = new ArrayList<>();
/**
* This argument excludes particular kinds of variants out of a list. If left empty, there is no type selection
* and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP,
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
*/
@Argument(fullName="selectTypeToExclude", shortName="xlSelectType", doc="Do not select certain type of variants from the input file", required=false)
private List<VariantContext.Type> typesToExclude = new ArrayList<>();
/**
* If a file containing a list of IDs is provided to this argument, the tool will only select variants whose ID
@ -395,29 +486,37 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="keepIDs", shortName="IDs", doc="List of variant IDs to select", required=false)
private File rsIDFile = null;
/**
* If a file containing a list of IDs is provided to this argument, the tool will not select variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. The expected file format
* is simply plain text with one ID per line.
*/
@Argument(fullName="excludeIDs", shortName="xlIDs", doc="List of variant IDs to select", required=false)
private File XLrsIDFile = null;
@Hidden
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
private boolean fullyDecode = false;
@Hidden
@Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false)
private boolean forceGenotypesDecode = false;
@Hidden
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
/**
* If this argument is provided, indels that are larger than the specified siwe will be excluded.
* If this argument is provided, indels that are larger than the specified size will be excluded.
*/
@Argument(fullName="maxIndelSize", required=false, doc="Maximum size of indels to include")
private int maxIndelSize = Integer.MAX_VALUE;
/**
* If this argument is provided, indels that are smaller than the specified size will be excluded.
*/
@Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include")
private int minIndelSize = 0;
@Hidden
@Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
private boolean allowNonOverlappingCommandLineSamples = false;
public enum NumberAlleleRestriction {
ALL,
@ -430,21 +529,22 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
private TreeSet<String> samples = new TreeSet<>();
private boolean NO_SAMPLES_SPECIFIED = false;
private boolean noSamplesSpecified = false;
private boolean DISCORDANCE_ONLY = false;
private boolean CONCORDANCE_ONLY = false;
private boolean discordanceOnly = false;
private boolean concordanceOnly = false;
private MendelianViolation mv;
/* variables used by the SELECT RANDOM modules */
private boolean SELECT_RANDOM_FRACTION = false;
private boolean selectRandomFraction = false;
//Random number generator for the genotypes to remove
private Random randomGenotypes = new Random();
private Set<String> IDsToKeep = null;
private Set<String> IDsToRemove = null;
private Map<String, VCFHeader> vcfRods;
/**
@ -472,103 +572,126 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
samples.addAll(samplesFromExpressions);
samples.addAll(samplesFromFile);
logger.debug(Utils.join(",",commandLineUniqueSamples));
if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) {
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
samples.removeAll(commandLineUniqueSamples);
} else if (commandLineUniqueSamples.size() > 0 ) {
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
"A list of these samples:",
Utils.join(",",commandLineUniqueSamples),
"To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES"));
}
logger.debug(Utils.join(",", commandLineUniqueSamples));
if (!commandLineUniqueSamples.isEmpty()) {
if (allowNonOverlappingCommandLineSamples) {
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
samples.removeAll(commandLineUniqueSamples);
} else {
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
"A list of these samples:",
Utils.join(",", commandLineUniqueSamples),
"To ignore these samples, run with --allowNonOverlappingCommandLineSamples"));
}
}
// if none were requested, we want all of them
if ( samples.isEmpty() ) {
samples.addAll(vcfSamples);
NO_SAMPLES_SPECIFIED = true;
noSamplesSpecified = true;
}
// now, exclude any requested samples
final Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
final Collection<String> XLsamplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, XLsampleExpressions);
samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames);
NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty();
samples.removeAll(XLsamplesFromExpressions);
noSamplesSpecified = noSamplesSpecified && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty() &&
XLsamplesFromExpressions.isEmpty();
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
if ( samples.isEmpty() && !noSamplesSpecified )
throw new UserException("All samples requested to be included were also requested to be excluded.");
if ( ! NO_SAMPLES_SPECIFIED )
if ( ! noSamplesSpecified )
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
if (typesToInclude.isEmpty()) {
for (VariantContext.Type t : VariantContext.Type.values())
selectedTypes.add(t);
}
else {
for (VariantContext.Type t : TYPES_TO_INCLUDE)
for (VariantContext.Type t : typesToInclude)
selectedTypes.add(t);
}
// remove specifed types
for (VariantContext.Type t : typesToExclude)
selectedTypes.remove(t);
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
if (KEEP_ORIGINAL_CHR_COUNTS) {
if (keepOriginalChrCounts) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY));
}
if (KEEP_ORIGINAL_DEPTH)
if (keepOriginalDepth)
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_DP_KEY));
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));
for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
for (int i = 0; i < selectExpressions.size(); i++) {
// It's not necessary that the user supply select names for the JEXL expressions, since those
// expressions will only be needed for omitting records. Make up the select names here.
selectNames.add(String.format("select-%d", i));
}
jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);
jexls = VariantContextUtils.initializeMatchExps(selectNames, selectExpressions);
// Look at the parameters to decide which analysis to perform
DISCORDANCE_ONLY = discordanceTrack.isBound();
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());
discordanceOnly = discordanceTrack.isBound();
if (discordanceOnly) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());
CONCORDANCE_ONLY = concordanceTrack.isBound();
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
concordanceOnly = concordanceTrack.isBound();
if (concordanceOnly) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
if (MENDELIAN_VIOLATIONS) {
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
if (mendelianViolations) {
mv = new MendelianViolation(medelianViolationQualThreshold,false,true);
}
SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
selectRandomFraction = fractionRandom > 0;
if (selectRandomFraction) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
/** load in the IDs file to a hashset for matching */
if ( rsIDFile != null ) {
IDsToKeep = new HashSet<>();
try {
for ( final String line : new XReadLines(rsIDFile).readLines() ) {
IDsToKeep.add(line.trim());
}
logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(rsIDFile, e);
}
}
// Get variant IDs to keep and removed
IDsToKeep = getIDsFromFile(rsIDFile);
IDsToRemove = getIDsFromFile(XLrsIDFile);
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
}
/**
* Get IDs from a file
*
* @param file file containing the IDs
* @return set of IDs or null if the file is null
* @throws UserException.CouldNotReadInputFile if could not read the file
*/
private Set<String> getIDsFromFile(final File file){
/** load in the IDs file to a hashset for matching */
if ( file != null ) {
Set<String> ids = new HashSet<>();
try {
for ( final java.lang.String line : new XReadLines(file).readLines() ) {
ids.add(line.trim());
}
logger.info("Selecting only variants with one of " + ids.size() + " IDs from " + file);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(file, e);
}
return ids;
}
return null;
}
/**
* Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing)
*
@ -584,7 +707,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
if ( vcs == null || vcs.size() == 0) {
if ( vcs == null || vcs.isEmpty()) {
return 0;
}
@ -593,24 +716,21 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );
// an option for performance testing only
if ( forceGenotypesDecode ) {
final int x = vc.getCalledChrCount();
//logger.info("forceGenotypesDecode with getCalledChrCount() = " + );
}
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) )
continue;
if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1)
if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) )
continue;
if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) )
break;
if (DISCORDANCE_ONLY) {
if (discordanceOnly) {
Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation());
if (!isDiscordant(vc, compVCs))
continue;
}
if (CONCORDANCE_ONLY) {
if (concordanceOnly) {
Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation());
if (!isConcordant(vc, compVCs))
continue;
@ -625,16 +745,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
if ( containsIndelLargerThan(vc, maxIndelSize) )
if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) )
continue;
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
// Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered
if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) {
// Write the subsetted variant if it matches all of the expressions
boolean failedJexlMatch = false;
try {
for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
if (!VariantContextUtils.match(sub, jexl)) {
if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){
failedJexlMatch = true;
break;
}
@ -646,7 +770,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
if ( !failedJexlMatch &&
!justRead &&
( !SELECT_RANDOM_FRACTION || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
vcfWriter.add(sub);
}
}
@ -656,19 +780,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
/*
* Determines if any of the alternate alleles are greater than the max indel size
* Determines if any of the alternate alleles are greater than the max indel size or less than the min indel size
*
* @param vc the variant context to check
* @param maxIndelSize the maximum size of allowed indels
* @return true if the VC contains an indel larger than maxIndelSize and false otherwise
* @param minIndelSize the minimum size of allowed indels
* @return true if the VC contains an indel larger than maxIndelSize or less than the minIndelSize, false otherwise
*/
protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) {
protected static boolean containsIndelLargerOrSmallerThan(final VariantContext vc, final int maxIndelSize, final int minIndelSize) {
final List<Integer> lengths = vc.getIndelLengths();
if ( lengths == null )
return false;
for ( Integer indelLength : lengths ) {
if ( Math.abs(indelLength) > maxIndelSize )
if ( Math.abs(indelLength) > maxIndelSize || Math.abs(indelLength) < minIndelSize )
return true;
}
@ -677,16 +802,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
/**
* Checks if vc has a variant call for (at least one of) the samples.
*
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
* @param compVCs the comparison VariantContext (discordance
* @return true if is discordant
* @param compVCs the comparison VariantContext (discordance)
* @return true VariantContexts are discordant, false otherwise
*/
private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
if (vc == null)
return false;
// if we're not looking at specific samples then the absence of a compVC means discordance
if (NO_SAMPLES_SPECIFIED)
if (noSamplesSpecified)
return (compVCs == null || compVCs.isEmpty());
// check if we find it in the variant rod
@ -712,12 +838,19 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return false; // we only get here if all samples have a variant in the comp rod.
}
/**
* Checks if the two variants have the same genotypes for the selected samples
*
* @param vc the variant rod VariantContext.
* @param compVCs the comparison VariantContext
* @return true if VariantContexts are concordant, false otherwise
*/
private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
if (vc == null || compVCs == null || compVCs.isEmpty())
return false;
// if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
if (NO_SAMPLES_SPECIFIED)
if (noSamplesSpecified)
return true;
// make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
@ -744,7 +877,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
private boolean sampleHasVariant(Genotype g) {
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered)));
}
private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
@ -753,7 +886,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ((g1.isCalled() && g2.isFiltered()) ||
(g2.isCalled() && g1.isFiltered()) ||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
(g1.isFiltered() && g2.isFiltered() && XLfiltered))
return false;
List<Allele> a1s = g1.getAlleles();
@ -787,7 +920,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
*/
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates )
if ( noSamplesSpecified && !removeUnusedAlternates )
return vc;
// strip out the alternate alleles that aren't being used
@ -843,7 +976,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set<String> selectedSampleNames) {
if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data
if ( KEEP_ORIGINAL_CHR_COUNTS ) {
if ( keepOriginalChrCounts ) {
final int[] indexOfOriginalAlleleForNewAllele;
final List<Allele> newAlleles = builder.getAlleles();
final int numOriginalAlleles = originalVC.getNAlleles();
@ -879,7 +1012,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
VariantContextUtils.calculateChromosomeCounts(builder, false);
if (KEEP_ORIGINAL_DEPTH && originalVC.hasAttribute(VCFConstants.DEPTH_KEY))
if (keepOriginalDepth && originalVC.hasAttribute(VCFConstants.DEPTH_KEY))
builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY));
boolean sawDP = false;

View File

@ -0,0 +1,107 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.filters;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.testng.Assert;
import org.testng.annotations.*;
public class VariantFiltrationUnitTest extends BaseTest {
private String chr1 = null;
private GenomeLoc genomeLoc = null;
private String vcFilter = "testFilter";
@BeforeTest
public void before() {
// Create GenomeLoc
IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta"));
GenomeLocParser genomeLocParser = new GenomeLocParser(fasta);
chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName();
genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10);
}
@DataProvider(name = "VariantMaskData")
public Object[][] DoesMaskCoverVariantTestData() {
final String maskName = "testMask";
List<Object[]> tests = Arrays.asList(new Object[]{chr1, 0, 0, maskName, 10, true, true},
new Object[]{"chr2", 0, 0, maskName, 10, true, false},
new Object[]{chr1, 0, 0, null, 10, true, true},
new Object[]{chr1, 0, 0, maskName, 10, true, true},
new Object[]{chr1, 0, 0, vcFilter, 10, true, false},
new Object[]{chr1, 0, 0, maskName, 1, true, false},
new Object[]{chr1, 15, 15, maskName, 10, false, true},
new Object[]{chr1, 15, 15, maskName, 1, false, false}
);
return tests.toArray(new Object[][]{});
}
/**
* Test doesMaskCoverVariant() logic
*
* @param contig chromosome or contig name
* @param start variant context start
* @param stop variant context stop
* @param maskName mask or filter name
* @param maskExtension bases beyond the mask
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
* @param expectedValue return the expected return value from doesMaskCoverVariant()
*/
@Test(dataProvider = "VariantMaskData")
public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
final boolean vcBeforeLoc, final boolean expectedValue) {
// Build VariantContext
final byte[] allele1 = Utils.dupBytes((byte) 'A', 1);
final byte[] allele2 = Utils.dupBytes((byte) 'T', 2);
final List<Allele> alleles = new ArrayList<Allele>(2);
final Allele ref = Allele.create(allele1, true);
final Allele alt = Allele.create(allele2, false);
alleles.add(ref);
alleles.add(alt);
final VariantContext vc = new VariantContextBuilder("test", contig, start, stop, alleles).filter(vcFilter).make();
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
Assert.assertEquals(coversVariant, expectedValue);
}
}

View File

@ -41,20 +41,22 @@ import java.util.List;
public class SelectVariantsUnitTest extends BaseTest {
//////////////////////////////////////////
// Tests for maxIndelSize functionality //
//////////////////////////////////////////
///////////////////////////////////////////////////////////
// Tests for maxIndelSize and minIndelSize functionality //
///////////////////////////////////////////////////////////
@DataProvider(name = "MaxIndelSize")
public Object[][] MaxIndelSizeTestData() {
@DataProvider(name = "MaxMinIndelSize")
public Object[][] MaxMinIndelSizeTestData() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int size : Arrays.asList(1, 3, 10, 100) ) {
for ( final int otherSize : Arrays.asList(0, 1) ) {
for ( final int max : Arrays.asList(0, 1, 5, 50, 100000) ) {
for ( final String op : Arrays.asList("D", "I") ) {
tests.add(new Object[]{size, otherSize, max, op});
for ( final int min : Arrays.asList(0, 1, 5, 50) ) {
for (final String op : Arrays.asList("D", "I")) {
tests.add(new Object[]{size, otherSize, max, min, op});
}
}
}
}
@ -63,8 +65,8 @@ public class SelectVariantsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "MaxIndelSize")
public void maxIndelSizeTest(final int size, final int otherSize, final int max, final String op) {
@Test(dataProvider = "MaxMinIndelSize")
public void maxIndelSizeTest(final int size, final int otherSize, final int max, final int min, final String op) {
final byte[] largerAllele = Utils.dupBytes((byte) 'A', size+1);
final byte[] smallerAllele = Utils.dupBytes((byte) 'A', 1);
@ -74,15 +76,11 @@ public class SelectVariantsUnitTest extends BaseTest {
final Allele alt = Allele.create(op.equals("D") ? smallerAllele : largerAllele, false);
alleles.add(ref);
alleles.add(alt);
if ( otherSize > 0 && otherSize != size ) {
final Allele otherAlt = Allele.create(op.equals("D") ? Utils.dupBytes((byte) 'A', size-otherSize+1) : Utils.dupBytes((byte) 'A', otherSize+1), false);
alleles.add(otherAlt);
}
final VariantContext vc = new VariantContextBuilder("test", "1", 10, 10 + ref.length() - 1, alleles).make();
boolean hasTooLargeIndel = SelectVariants.containsIndelLargerThan(vc, max);
Assert.assertEquals(hasTooLargeIndel, size > max);
boolean hasIndelTooLargeOrSmall = SelectVariants.containsIndelLargerOrSmallerThan(vc, max, min);
Assert.assertEquals(hasIndelTooLargeOrSmall, size > max || size < min);
}
}

View File

@ -77,6 +77,17 @@ public class Utils {
return x != y;
}
/**
* Invert logic if specified
*
* @param logic boolean logical operation value
* @param invert whether to invert logic
* @return invert logic if invert flag is true, otherwise leave the logic
*/
public static boolean invertLogic(final boolean logic, final boolean invert){
return logic ^ invert;
}
/**
* Calculates the optimum initial size for a hash table given the maximum number
* of elements it will need to hold. The optimum size is the smallest size that