diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index c538d6b81..0816b00f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -144,6 +144,14 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("test filter with separate names #2", spec); } + @Test + public void testInvertFilter() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1, + Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765")); + executeTest("test inversion of selection of filter with separate names #2", spec); + } + @Test public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( @@ -194,4 +202,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { Arrays.asList("e10485c7c33d9211d0c1294fd7858476")); executeTest("testFilteringDPfromFORMAT", spec); } + + @Test + public void testInvertGenotypeFilterExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1, + Arrays.asList("d2664870e7145eb73a2295766482c823")); + executeTest("testInvertGenotypeFilterExpression", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index a8a479e2e..e0ce9405e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -64,6 +64,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; } + private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537"; + private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b"; + @Test public void testDiscordanceNoSampleSpecified() { String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; @@ -150,6 +153,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testNonExistingSelection--" + testfile, spec); } + /** + * Test excluding samples from file and sample name + */ @Test public void testSampleExclusionFromFileAndSeparateSample() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -162,9 +168,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest { ); spec.disableShadowBCF(); - executeTest("testSampleExclusion--" + testfile, spec); + executeTest("testSampleExclusionFromFileAndSeparateSample--" + testfile, spec); } + /** + * Test excluding samples from file + */ @Test public void testSampleExclusionJustFromFile() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -177,9 +186,46 @@ public class SelectVariantsIntegrationTest extends WalkerTest { ); spec.disableShadowBCF(); - executeTest("testSampleExclusion--" + testfile, spec); + executeTest("testSampleExclusionJustFromFile--" + testfile, spec); } + /** + * Test excluding samples from expression + */ + @Test + public void testSampleExclusionJustFromExpression() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile, + 1, + Arrays.asList(sampleExclusionMD5) + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusionJustFromExpression--" + testfile, spec); + } + + /** + * Test excluding samples from negation expression + */ + @Test + public void testSampleExclusionJustFromNegationExpression() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile, + 1, + Arrays.asList(sampleExclusionMD5) + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusionJustFromRegexExpression--" + testfile, spec); + } + + /** + * Test including samples that are not in the VCF + */ @Test public void testSampleInclusionWithNonexistingSamples() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -195,7 +241,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); } - @Test public void testConcordance() { String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; @@ -212,6 +257,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testConcordance--" + testFile, spec); } + /** + * Test including variant types. + */ @Test public void testVariantTypeSelection() { String testFile = privateTestDir + "complexExample1.vcf"; @@ -219,23 +267,42 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") ); executeTest("testVariantTypeSelection--" + testFile, spec); } + /** + * Test excluding indels that are larger than the specified size + */ @Test - public void testIndelLengthSelection() { + public void testMaxIndelLengthSelection() { String testFile = privateTestDir + "complexExample1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 2", 1, - Arrays.asList("004589868ca5dc887e2dff876b4cc797") + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") ); - executeTest("testIndelLengthSelection--" + testFile, spec); + executeTest("testMaxIndelLengthSelection--" + testFile, spec); + } + + /** + * Test excluding indels that are smaller than the specified size + */ + @Test + public void testMinIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2", + 1, + Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb") + ); + + executeTest("testMinIndelLengthSelection--" + testFile, spec); } @Test @@ -488,4 +555,114 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testMissingVcfException--" + testfile, spec); } + + /** + * Test inverting the variant selection criteria + */ + @Test + public void testInvertSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile), + 1, + Arrays.asList(invertSelectionMD5) + ); + spec.disableShadowBCF(); + executeTest("testInvertSelection--" + testfile, spec); + } + + /** + * Test inverting the variant JEXL selection criteria + */ + @Test + public void testInvertJexlSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile), + 1, + Arrays.asList(invertSelectionMD5) + ); + spec.disableShadowBCF(); + executeTest("testInvertJexlSelection--" + testfile, spec); + } + + /** + * Test selecting variants with IDs + */ + @Test + public void testKeepSelectionID() { + String testFile = privateTestDir + "complexExample1.vcf"; + String idFile = privateTestDir + "complexExample1.vcf.id"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -IDs " + idFile + " --variant " + testFile), + 1, + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") + ); + spec.disableShadowBCF(); + executeTest("testKeepSelectionID--" + testFile, spec); + } + + /** + * Test excluding variants with IDs + */ + @Test + public void testExcludeSelectionID() { + String testFile = privateTestDir + "complexExample1.vcf"; + String idFile = privateTestDir + "complexExample1.vcf.id"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -xlIDs " + idFile + " --variant " + testFile), + 1, + Arrays.asList("77514a81233e1bbc0f5e47b0fb76a89a") + ); + spec.disableShadowBCF(); + executeTest("testExcludeSelectionID--" + testFile, spec); + } + + /** + * Test excluding variant types + */ + @Test + public void testExcludeSelectionType() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb") + ); + + executeTest("testExcludeSelectionType--" + testFile, spec); + } + + @Test + public void testMendelianViolationSelection() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + String pedFile = privateTestDir + "CEUtrio.ped"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("406243096074a417d2aa103bd3d13e01")); + + executeTest("testMendelianViolationSelection--" + testFile, spec); + } + + @Test + public void testInvertMendelianViolationSelection() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + String pedFile = privateTestDir + "CEUtrio.ped"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("35921fb2dedca0ead83027a66b725794")); + + executeTest("testMendelianViolationSelection--" + testFile, spec); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index 42276599f..aa2e96f1a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; @@ -104,13 +105,13 @@ public class VariantFiltration extends RodWalker { * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). */ @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) - protected ArrayList FILTER_EXPS = new ArrayList(); + protected ArrayList filterExpressions = new ArrayList(); /** * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. */ @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) - protected ArrayList FILTER_NAMES = new ArrayList(); + protected ArrayList filterNames = new ArrayList(); /** * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. @@ -120,13 +121,13 @@ public class VariantFiltration extends RodWalker { * expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object. */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false) - protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList(); + protected ArrayList genotypeFilterExpressions = new ArrayList(); /** * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) - protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList(); + protected ArrayList genotypeFilterNames = new ArrayList(); /** * Works together with the --clusterWindowSize argument. @@ -141,7 +142,7 @@ public class VariantFiltration extends RodWalker { protected Integer clusterWindow = 0; @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false) - protected Integer MASK_EXTEND = 0; + protected Integer maskExtension = 0; /** * When using the -mask argument, the maskName will be annotated in the variant record. @@ -150,7 +151,7 @@ public class VariantFiltration extends RodWalker { * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). */ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) - protected String MASK_NAME = "Mask"; + protected String maskName = "Mask"; /** * By default, if the -mask argument is used, any variant falling in a mask will be filtered. @@ -167,7 +168,7 @@ public class VariantFiltration extends RodWalker { * Use this argument to have it evaluate as failing filters instead for these cases. */ @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) - protected Boolean FAIL_MISSING_VALUES = false; + protected Boolean failMissingValues = false; /** * Invalidate previous filters applied to the VariantContext, applying only the filters here @@ -175,6 +176,18 @@ public class VariantFiltration extends RodWalker { @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) boolean invalidatePrevious = false; + /** + * Invert the selection criteria for --filterExpression + */ + @Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false) + protected boolean invertFilterExpression = false; + + /** + * Invert the selection criteria for --genotypeFilterExpression + */ + @Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false) + protected boolean invertGenotypeFilterExpression = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -185,9 +198,21 @@ public class VariantFiltration extends RodWalker { // the structures necessary to initialize and maintain a windowed context private FiltrationContextWindow variantContextWindow; - private static final int windowSize = 10; // 10 variants on either end of the current one + private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one private ArrayList windowInitializer = new ArrayList(); + /** + * Prepend inverse phrase to description if --invert_filter_expression + * + * @param description the description + * @return the description with inverse prepended if --invert_filter_expression + */ + private String possiblyInvertFilterExpression( String description ){ + if ( invertFilterExpression ) + description = "Inverse of: " + description; + return description; + } + private void initializeVcfWriter() { final List inputNames = Arrays.asList(variantCollection.variants.getName()); @@ -199,19 +224,19 @@ public class VariantFiltration extends RodWalker { if ( clusterWindow > 0 ) hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); - if ( genotypeFilterExps.size() > 0 ) + if ( !genotypeFilterExps.isEmpty() ) hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); try { for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); + hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString()))); if ( mask.isBound() ) { if (filterRecordsNotInMask) - hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask")); - else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); + hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask")); + else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask")); } } catch (IllegalArgumentException e) { throw new UserException.BadInput(e.getMessage()); @@ -224,13 +249,13 @@ public class VariantFiltration extends RodWalker { if ( clusterWindow > 0 ) clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow); - if ( MASK_EXTEND < 0 ) + if ( maskExtension < 0 ) throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); if (filterRecordsNotInMask && !mask.isBound()) throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided"); - filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS); - genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS); + filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions); + genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions); VariantContextUtils.engine.get().setSilent(true); @@ -262,15 +287,9 @@ public class VariantFiltration extends RodWalker { if ( invalidatePrevious ) { vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); } + // filter based on previous mask position - if ( previousMaskPosition != null && // we saw a previous mask site - previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig - vc.getStart() - previousMaskPosition.getStop() <= MASK_EXTEND && // it's within the mask area (multi-base masks that overlap this site will always give a negative distance) - (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied - Set filters = new LinkedHashSet(vc.getFilters()); - filters.add(MASK_NAME); - vc = new VariantContextBuilder(vc).filters(filters).make(); - } + vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false); FiltrationContext varContext = new FiltrationContext(ref, vc); @@ -280,11 +299,11 @@ public class VariantFiltration extends RodWalker { // if this is a mask position, filter previous records if ( hasMask ) { for ( FiltrationContext prevVC : windowInitializer ) - prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus())); + prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); } windowInitializer.add(varContext); - if ( windowInitializer.size() == windowSize ) { + if ( windowInitializer.size() == WINDOW_SIZE ) { variantContextWindow = new FiltrationContextWindow(windowInitializer); windowInitializer = null; } @@ -294,7 +313,7 @@ public class VariantFiltration extends RodWalker { if ( hasMask ) { for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) { if ( prevVC != null ) - prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus())); + prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true)); } } @@ -306,12 +325,44 @@ public class VariantFiltration extends RodWalker { return 1; } - private VariantContext checkMaskForPreviousLocation(VariantContext vc, GenomeLoc maskLoc) { - if ( maskLoc.getContig().equals(vc.getChr()) && // it's on the same contig - maskLoc.getStart() - vc.getEnd() <= MASK_EXTEND && // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) - (vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied + /** + * Helper function to check if a mask covers the variant location. + * + * @param vc variant context + * @param genomeLoc genome location + * @param maskName name of the mask + * @param maskExtension bases beyond the mask + * @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true. + * @return true if the genome location is within the extended mask area, false otherwise + */ + protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) { + boolean logic = genomeLoc != null && // have a location + genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig + (vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied + if ( logic ) { + if (vcBeforeLoc) + return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance) + else + return vc.getStart() - genomeLoc.getStop() <= maskExtension; + } else { + return false; + } + } + + /** + * Add mask to variant context filters if it covers the it's location + * + * @param vc VariantContext + * @param genomeLoc genome location + * @param maskName name of the mask + * @param maskExtension bases beyond the mask + * @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite. + * @return VariantContext with the mask added if the VariantContext is within the extended mask area + */ + private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) { + if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) { Set filters = new LinkedHashSet(vc.getFilters()); - filters.add(MASK_NAME); + filters.add(maskName); vc = new VariantContextBuilder(vc).filters(filters).make(); } @@ -328,7 +379,7 @@ public class VariantFiltration extends RodWalker { final VariantContextBuilder builder = new VariantContextBuilder(vc); // make new Genotypes based on filters - if ( genotypeFilterExps.size() > 0 ) { + if ( !genotypeFilterExps.isEmpty() ) { GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size()); // for each genotype, check filters then create a new object @@ -338,7 +389,7 @@ public class VariantFiltration extends RodWalker { if ( g.isFiltered() ) filters.add(g.getFilters()); for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { - if ( VariantContextUtils.match(vc, g, exp) ) + if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) ) filters.add(exp.name); } @@ -360,11 +411,11 @@ public class VariantFiltration extends RodWalker { for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) { try { - if ( VariantContextUtils.match(vc, exp) ) + if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) ) filters.add(exp.name); } catch (Exception e) { // do nothing unless specifically asked to; it just means that the expression isn't defined for this context - if ( FAIL_MISSING_VALUES ) + if ( failMissingValues ) filters.add(exp.name); } } @@ -389,11 +440,11 @@ public class VariantFiltration extends RodWalker { public void onTraversalDone(Integer result) { // move the window over so that we can filter the last few variants if ( windowInitializer != null ) { - while ( windowInitializer.size() < windowSize ) + while ( windowInitializer.size() < WINDOW_SIZE ) windowInitializer.add(null); variantContextWindow = new FiltrationContextWindow(windowInitializer); } - for (int i=0; i < windowSize; i++) { + for (int i=0; i < WINDOW_SIZE; i++) { variantContextWindow.moveWindow(null); filter(); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 9b9738164..63971c497 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -113,9 +113,22 @@ import java.util.*; * -se 'SAMPLE.+PARC' * * - *

Select any sample that matches a regular expression and sites where the QD annotation is more than 10

+ *

Exclude two samples and any sample that matches a regular expression:

*
  * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -xl_sn SAMPLE_1_PARC \
+ *   -xl_sn SAMPLE_1_ACTG \
+ *   -xl_se 'SAMPLE.+PARC'
+ * 
+ * + *

Select any sample that matches a regular expression and sites where the QD annotation is more than 10:

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
  *   -T SelectVariants \
  *   -R reference.fasta \
  *   -V input.vcf \
@@ -124,9 +137,22 @@ import java.util.*;
  *   -select "QD > 10.0"
  * 
* - *

Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default)

+ *

Select any sample that does not match a regular expression and sites where the QD annotation is more than 10:

+ *
+ * java  -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -se 'SAMPLE.+PARC' \
+ *   -select "QD > 10.0"
+ *   --invert_selection
+ * 
+ * + *

Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):

*
  * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
  *   -T SelectVariants \
  *   -R reference.fasta \
  *   -V input.vcf \
@@ -136,7 +162,7 @@ import java.util.*;
  *   -ef
  * 
* - *

Select a sample, subset remaining alleles, but don't trim

+ *

Select a sample, subset remaining alleles, but don't trim:

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -148,7 +174,7 @@ import java.util.*;
  *   -noTrim
  *
* - *

Select a sample and restrict the output vcf to a set of intervals

+ *

Select a sample and restrict the output vcf to a set of intervals:

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -159,7 +185,7 @@ import java.util.*;
  *   -sn SAMPLE_1_ACTG
  * 
* - *

Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset)

+ *

Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset):

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -170,7 +196,7 @@ import java.util.*;
  *   -sn mySample
  * 
* - *

Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers)

+ *

Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers):

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -192,7 +218,18 @@ import java.util.*;
  *   -o violations.vcf
  * 
* - *

Create a set with 50% of the total number of variants in the variant VCF

+ *

Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T SelectVariants \
+ *   -R reference.fasta \
+ *   -V input.vcf \
+ *   -ped family.ped \
+ *   -mv -mvq 50 -inv_mv \
+ *   -o violations.vcf
+ * 
+ * + *

Create a set with 50% of the total number of variants in the variant VCF:

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -202,17 +239,30 @@ import java.util.*;
  *   -fraction 0.5
  * 
* - *

Select only indels from a VCF

+ *

Select only indels between 2 and 5 bases long from a VCF:

*
  * java -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
  *   -T SelectVariants \
  *   -R reference.fasta \
  *   -V input.vcf \
  *   -o output.vcf \
  *   -selectType INDEL
+ *   -minIndelSize 2
+ *   -maxIndelSize 5
  * 
* - *

Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column)

+ *

Exclude indels from a VCF:

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -selectTypeToExclude INDEL
+ * 
+ * + *

Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T SelectVariants \
@@ -223,6 +273,17 @@ import java.util.*;
  *   -restrictAllelesTo MULTIALLELIC
  * 
* + *

Select IDs in fileKeep and exclude IDs in fileExclude:

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectVariants \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -IDs fileKeep \
+ *   -excludeIDs fileExclude
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class SelectVariants extends RodWalker implements TreeReducible { @@ -258,7 +319,7 @@ public class SelectVariants extends RodWalker implements TreeR * argument can be specified multiple times in order to use multiple different matching patterns. */ @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select multiple samples", required=false) - public Set sampleExpressions ; + public Set sampleExpressions; /** * Sample names should be in a plain text file listing one sample name per line. This argument can be specified multiple times in order to provide @@ -282,26 +343,40 @@ public class SelectVariants extends RodWalker implements TreeR @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="List of samples to exclude", required=false) public Set XLsampleFiles = new HashSet<>(0); + /** + * Using a regular expression allows you to match multiple sample names that have that pattern in common. Note that sample exclusion takes precedence + * over inclusion, so that if a sample is in both lists it will be excluded. This argument can be specified multiple times in order to use multiple + * different matching patterns. + */ + @Input(fullName="exclude_sample_expressions", shortName="xl_se", doc="List of sample expressions to exclude", required=false) + public Set XLsampleExpressions = new HashSet<>(0); + /** * See example commands above for detailed usage examples. Note that these expressions are evaluated *after* the * specified samples are extracted and the INFO field annotations are updated. */ @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) - public ArrayList SELECT_EXPRESSIONS = new ArrayList<>(); + public ArrayList selectExpressions = new ArrayList<>(); /** + * Invert the selection criteria for --select. + */ + @Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false) + protected boolean invertSelection = false; + + /* * If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none * of the selected samples display evidence of variation) will be excluded from the output. */ @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include non-variant sites", required=false) - protected boolean EXCLUDE_NON_VARIANTS = false; + protected boolean XLnonVariants = false; /** * If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS` * in the FILTER field) will be excluded from the output. */ @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered sites", required=false) - protected boolean EXCLUDE_FILTERED = false; + protected boolean XLfiltered = false; /** * The default behavior of this tool is to remove bases common to all remaining alleles after subsetting @@ -338,7 +413,7 @@ public class SelectVariants extends RodWalker implements TreeR * AC_Orig, AF_Orig, and AN_Orig. */ @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false) - private boolean KEEP_ORIGINAL_CHR_COUNTS = false; + private boolean keepOriginalChrCounts = false; /** * When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the contents of the @@ -346,7 +421,7 @@ public class SelectVariants extends RodWalker implements TreeR * DP_Orig. */ @Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value after subsetting", required=false) - private boolean KEEP_ORIGINAL_DEPTH = false; + private boolean keepOriginalDepth = false; /** * If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as @@ -354,14 +429,22 @@ public class SelectVariants extends RodWalker implements TreeR * `-ped` argument. */ @Argument(fullName="mendelianViolation", shortName="mv", doc="Output mendelian violation sites only", required=false) - private Boolean MENDELIAN_VIOLATIONS = false; + private Boolean mendelianViolations = false; + + /** + * If this flag is enabled, this tool will select only variants that do not correspond to a mendelian violation as + * determined on the basis of family structure. Requires passing a pedigree file using the engine-level + * `-ped` argument. + */ + @Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false) + private Boolean invert_mendelianViolations = false; /** * This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order * for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument to have an effect. */ @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum GQ score for each trio member to accept a site as a violation", required=false) - protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; + protected double medelianViolationQualThreshold = 0; /** * The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be @@ -385,7 +468,15 @@ public class SelectVariants extends RodWalker implements TreeR * SYMBOLIC, NO_VARIATION. Can be specified multiple times. */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file", required=false) - private List TYPES_TO_INCLUDE = new ArrayList<>(); + private List typesToInclude = new ArrayList<>(); + + /** + * This argument excludes particular kinds of variants out of a list. If left empty, there is no type selection + * and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP, + * SYMBOLIC, NO_VARIATION. Can be specified multiple times. + */ + @Argument(fullName="selectTypeToExclude", shortName="xlSelectType", doc="Do not select certain type of variants from the input file", required=false) + private List typesToExclude = new ArrayList<>(); /** * If a file containing a list of IDs is provided to this argument, the tool will only select variants whose ID @@ -395,29 +486,37 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="keepIDs", shortName="IDs", doc="List of variant IDs to select", required=false) private File rsIDFile = null; + /** + * If a file containing a list of IDs is provided to this argument, the tool will not select variants whose ID + * field is present in this list of IDs. The matching is done by exact string matching. The expected file format + * is simply plain text with one ID per line. + */ + @Argument(fullName="excludeIDs", shortName="xlIDs", doc="List of variant IDs to select", required=false) + private File XLrsIDFile = null; @Hidden @Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false) private boolean fullyDecode = false; - @Hidden - @Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false) - private boolean forceGenotypesDecode = false; - @Hidden @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; /** - * If this argument is provided, indels that are larger than the specified siwe will be excluded. + * If this argument is provided, indels that are larger than the specified size will be excluded. */ @Argument(fullName="maxIndelSize", required=false, doc="Maximum size of indels to include") private int maxIndelSize = Integer.MAX_VALUE; + /** + * If this argument is provided, indels that are smaller than the specified size will be excluded. + */ + @Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include") + private int minIndelSize = 0; + @Hidden @Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.") - private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; - + private boolean allowNonOverlappingCommandLineSamples = false; public enum NumberAlleleRestriction { ALL, @@ -430,21 +529,22 @@ public class SelectVariants extends RodWalker implements TreeR private List jexls = null; private TreeSet samples = new TreeSet<>(); - private boolean NO_SAMPLES_SPECIFIED = false; + private boolean noSamplesSpecified = false; - private boolean DISCORDANCE_ONLY = false; - private boolean CONCORDANCE_ONLY = false; + private boolean discordanceOnly = false; + private boolean concordanceOnly = false; private MendelianViolation mv; /* variables used by the SELECT RANDOM modules */ - private boolean SELECT_RANDOM_FRACTION = false; + private boolean selectRandomFraction = false; //Random number generator for the genotypes to remove private Random randomGenotypes = new Random(); private Set IDsToKeep = null; + private Set IDsToRemove = null; private Map vcfRods; /** @@ -472,103 +572,126 @@ public class SelectVariants extends RodWalker implements TreeR samples.addAll(samplesFromExpressions); samples.addAll(samplesFromFile); - logger.debug(Utils.join(",",commandLineUniqueSamples)); - - if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { - logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); - samples.removeAll(commandLineUniqueSamples); - } else if (commandLineUniqueSamples.size() > 0 ) { - throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", - "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", - "A list of these samples:", - Utils.join(",",commandLineUniqueSamples), - "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); - } + logger.debug(Utils.join(",", commandLineUniqueSamples)); + if (!commandLineUniqueSamples.isEmpty()) { + if (allowNonOverlappingCommandLineSamples) { + logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); + samples.removeAll(commandLineUniqueSamples); + } else { + throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", + "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", + "A list of these samples:", + Utils.join(",", commandLineUniqueSamples), + "To ignore these samples, run with --allowNonOverlappingCommandLineSamples")); + } + } // if none were requested, we want all of them if ( samples.isEmpty() ) { samples.addAll(vcfSamples); - NO_SAMPLES_SPECIFIED = true; + noSamplesSpecified = true; } // now, exclude any requested samples final Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); + final Collection XLsamplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, XLsampleExpressions); samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsampleNames); - NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty(); + samples.removeAll(XLsamplesFromExpressions); + noSamplesSpecified = noSamplesSpecified && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty() && + XLsamplesFromExpressions.isEmpty(); - if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED ) + if ( samples.isEmpty() && !noSamplesSpecified ) throw new UserException("All samples requested to be included were also requested to be excluded."); - if ( ! NO_SAMPLES_SPECIFIED ) + if ( ! noSamplesSpecified ) for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include - if (TYPES_TO_INCLUDE.isEmpty()) { - + if (typesToInclude.isEmpty()) { for (VariantContext.Type t : VariantContext.Type.values()) selectedTypes.add(t); - } else { - for (VariantContext.Type t : TYPES_TO_INCLUDE) + for (VariantContext.Type t : typesToInclude) selectedTypes.add(t); - } + + // remove specifed types + for (VariantContext.Type t : typesToExclude) + selectedTypes.remove(t); + // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); - if (KEEP_ORIGINAL_CHR_COUNTS) { + if (keepOriginalChrCounts) { headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY)); } - if (KEEP_ORIGINAL_DEPTH) + if (keepOriginalDepth) headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_DP_KEY)); headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); - for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { + for (int i = 0; i < selectExpressions.size(); i++) { // It's not necessary that the user supply select names for the JEXL expressions, since those // expressions will only be needed for omitting records. Make up the select names here. selectNames.add(String.format("select-%d", i)); } - jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); + jexls = VariantContextUtils.initializeMatchExps(selectNames, selectExpressions); // Look at the parameters to decide which analysis to perform - DISCORDANCE_ONLY = discordanceTrack.isBound(); - if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName()); + discordanceOnly = discordanceTrack.isBound(); + if (discordanceOnly) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName()); - CONCORDANCE_ONLY = concordanceTrack.isBound(); - if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); + concordanceOnly = concordanceTrack.isBound(); + if (concordanceOnly) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); - if (MENDELIAN_VIOLATIONS) { - mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); + if (mendelianViolations) { + mv = new MendelianViolation(medelianViolationQualThreshold,false,true); } - SELECT_RANDOM_FRACTION = fractionRandom > 0; - if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); + selectRandomFraction = fractionRandom > 0; + if (selectRandomFraction) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); - /** load in the IDs file to a hashset for matching */ - if ( rsIDFile != null ) { - IDsToKeep = new HashSet<>(); - try { - for ( final String line : new XReadLines(rsIDFile).readLines() ) { - IDsToKeep.add(line.trim()); - } - logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(rsIDFile, e); - } - } + // Get variant IDs to keep and removed + IDsToKeep = getIDsFromFile(rsIDFile); + + IDsToRemove = getIDsFromFile(XLrsIDFile); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); } + /** + * Get IDs from a file + * + * @param file file containing the IDs + * @return set of IDs or null if the file is null + * @throws UserException.CouldNotReadInputFile if could not read the file + */ + private Set getIDsFromFile(final File file){ + /** load in the IDs file to a hashset for matching */ + if ( file != null ) { + Set ids = new HashSet<>(); + try { + for ( final java.lang.String line : new XReadLines(file).readLines() ) { + ids.add(line.trim()); + } + logger.info("Selecting only variants with one of " + ids.size() + " IDs from " + file); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(file, e); + } + return ids; + } + + return null; + } + /** * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing) * @@ -584,7 +707,7 @@ public class SelectVariants extends RodWalker implements TreeR Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); - if ( vcs == null || vcs.size() == 0) { + if ( vcs == null || vcs.isEmpty()) { return 0; } @@ -593,24 +716,21 @@ public class SelectVariants extends RodWalker implements TreeR if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() ); - // an option for performance testing only - if ( forceGenotypesDecode ) { - final int x = vc.getCalledChrCount(); - //logger.info("forceGenotypesDecode with getCalledChrCount() = " + ); - } - - if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) + if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) ) continue; - if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1) + if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) ) + continue; + + if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) ) break; - if (DISCORDANCE_ONLY) { + if (discordanceOnly) { Collection compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) continue; } - if (CONCORDANCE_ONLY) { + if (concordanceOnly) { Collection compVCs = tracker.getValues(concordanceTrack, context.getLocation()); if (!isConcordant(vc, compVCs)) continue; @@ -625,16 +745,20 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - if ( containsIndelLargerThan(vc, maxIndelSize) ) + if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) ) continue; VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); - if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { + // Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered + if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) { + + // Write the subsetted variant if it matches all of the expressions boolean failedJexlMatch = false; + try { for (VariantContextUtils.JexlVCMatchExp jexl : jexls) { - if (!VariantContextUtils.match(sub, jexl)) { + if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){ failedJexlMatch = true; break; } @@ -646,7 +770,7 @@ public class SelectVariants extends RodWalker implements TreeR } if ( !failedJexlMatch && !justRead && - ( !SELECT_RANDOM_FRACTION || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) { + ( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) { vcfWriter.add(sub); } } @@ -656,19 +780,20 @@ public class SelectVariants extends RodWalker implements TreeR } /* - * Determines if any of the alternate alleles are greater than the max indel size + * Determines if any of the alternate alleles are greater than the max indel size or less than the min indel size * * @param vc the variant context to check * @param maxIndelSize the maximum size of allowed indels - * @return true if the VC contains an indel larger than maxIndelSize and false otherwise + * @param minIndelSize the minimum size of allowed indels + * @return true if the VC contains an indel larger than maxIndelSize or less than the minIndelSize, false otherwise */ - protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) { + protected static boolean containsIndelLargerOrSmallerThan(final VariantContext vc, final int maxIndelSize, final int minIndelSize) { final List lengths = vc.getIndelLengths(); if ( lengths == null ) return false; for ( Integer indelLength : lengths ) { - if ( Math.abs(indelLength) > maxIndelSize ) + if ( Math.abs(indelLength) > maxIndelSize || Math.abs(indelLength) < minIndelSize ) return true; } @@ -677,16 +802,17 @@ public class SelectVariants extends RodWalker implements TreeR /** * Checks if vc has a variant call for (at least one of) the samples. + * * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap) - * @param compVCs the comparison VariantContext (discordance - * @return true if is discordant + * @param compVCs the comparison VariantContext (discordance) + * @return true VariantContexts are discordant, false otherwise */ private boolean isDiscordant (VariantContext vc, Collection compVCs) { if (vc == null) return false; // if we're not looking at specific samples then the absence of a compVC means discordance - if (NO_SAMPLES_SPECIFIED) + if (noSamplesSpecified) return (compVCs == null || compVCs.isEmpty()); // check if we find it in the variant rod @@ -712,12 +838,19 @@ public class SelectVariants extends RodWalker implements TreeR return false; // we only get here if all samples have a variant in the comp rod. } + /** + * Checks if the two variants have the same genotypes for the selected samples + * + * @param vc the variant rod VariantContext. + * @param compVCs the comparison VariantContext + * @return true if VariantContexts are concordant, false otherwise + */ private boolean isConcordant (VariantContext vc, Collection compVCs) { if (vc == null || compVCs == null || compVCs.isEmpty()) return false; // if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant. - if (NO_SAMPLES_SPECIFIED) + if (noSamplesSpecified) return true; // make a list of all samples contained in this variant VC that are being tracked by the user command line arguments. @@ -744,7 +877,7 @@ public class SelectVariants extends RodWalker implements TreeR } private boolean sampleHasVariant(Genotype g) { - return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); + return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered))); } private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) { @@ -753,7 +886,7 @@ public class SelectVariants extends RodWalker implements TreeR if ((g1.isCalled() && g2.isFiltered()) || (g2.isCalled() && g1.isFiltered()) || - (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) + (g1.isFiltered() && g2.isFiltered() && XLfiltered)) return false; List a1s = g1.getAlleles(); @@ -787,7 +920,7 @@ public class SelectVariants extends RodWalker implements TreeR */ private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) { //subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible - if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates ) + if ( noSamplesSpecified && !removeUnusedAlternates ) return vc; // strip out the alternate alleles that aren't being used @@ -843,7 +976,7 @@ public class SelectVariants extends RodWalker implements TreeR private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set selectedSampleNames) { if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data - if ( KEEP_ORIGINAL_CHR_COUNTS ) { + if ( keepOriginalChrCounts ) { final int[] indexOfOriginalAlleleForNewAllele; final List newAlleles = builder.getAlleles(); final int numOriginalAlleles = originalVC.getNAlleles(); @@ -879,7 +1012,7 @@ public class SelectVariants extends RodWalker implements TreeR VariantContextUtils.calculateChromosomeCounts(builder, false); - if (KEEP_ORIGINAL_DEPTH && originalVC.hasAttribute(VCFConstants.DEPTH_KEY)) + if (keepOriginalDepth && originalVC.hasAttribute(VCFConstants.DEPTH_KEY)) builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY)); boolean sawDP = false; diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java new file mode 100644 index 000000000..ab972a119 --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationUnitTest.java @@ -0,0 +1,107 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.filters; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.testng.Assert; +import org.testng.annotations.*; + +public class VariantFiltrationUnitTest extends BaseTest { + + private String chr1 = null; + private GenomeLoc genomeLoc = null; + private String vcFilter = "testFilter"; + + @BeforeTest + public void before() { + // Create GenomeLoc + IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta")); + GenomeLocParser genomeLocParser = new GenomeLocParser(fasta); + chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName(); + genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10); + } + + @DataProvider(name = "VariantMaskData") + public Object[][] DoesMaskCoverVariantTestData() { + + final String maskName = "testMask"; + + List tests = Arrays.asList(new Object[]{chr1, 0, 0, maskName, 10, true, true}, + new Object[]{"chr2", 0, 0, maskName, 10, true, false}, + new Object[]{chr1, 0, 0, null, 10, true, true}, + new Object[]{chr1, 0, 0, maskName, 10, true, true}, + new Object[]{chr1, 0, 0, vcFilter, 10, true, false}, + new Object[]{chr1, 0, 0, maskName, 1, true, false}, + new Object[]{chr1, 15, 15, maskName, 10, false, true}, + new Object[]{chr1, 15, 15, maskName, 1, false, false} + ); + return tests.toArray(new Object[][]{}); + } + + /** + * Test doesMaskCoverVariant() logic + * + * @param contig chromosome or contig name + * @param start variant context start + * @param stop variant context stop + * @param maskName mask or filter name + * @param maskExtension bases beyond the mask + * @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true. + * @param expectedValue return the expected return value from doesMaskCoverVariant() + */ + @Test(dataProvider = "VariantMaskData") + public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension, + final boolean vcBeforeLoc, final boolean expectedValue) { + + // Build VariantContext + final byte[] allele1 = Utils.dupBytes((byte) 'A', 1); + final byte[] allele2 = Utils.dupBytes((byte) 'T', 2); + + final List alleles = new ArrayList(2); + final Allele ref = Allele.create(allele1, true); + final Allele alt = Allele.create(allele2, false); + alleles.add(ref); + alleles.add(alt); + + final VariantContext vc = new VariantContextBuilder("test", contig, start, stop, alleles).filter(vcFilter).make(); + + boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc); + Assert.assertEquals(coversVariant, expectedValue); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsUnitTest.java index 117f50714..91f1a7931 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsUnitTest.java @@ -41,20 +41,22 @@ import java.util.List; public class SelectVariantsUnitTest extends BaseTest { - ////////////////////////////////////////// - // Tests for maxIndelSize functionality // - ////////////////////////////////////////// + /////////////////////////////////////////////////////////// + // Tests for maxIndelSize and minIndelSize functionality // + /////////////////////////////////////////////////////////// - @DataProvider(name = "MaxIndelSize") - public Object[][] MaxIndelSizeTestData() { + @DataProvider(name = "MaxMinIndelSize") + public Object[][] MaxMinIndelSizeTestData() { List tests = new ArrayList(); for ( final int size : Arrays.asList(1, 3, 10, 100) ) { for ( final int otherSize : Arrays.asList(0, 1) ) { for ( final int max : Arrays.asList(0, 1, 5, 50, 100000) ) { - for ( final String op : Arrays.asList("D", "I") ) { - tests.add(new Object[]{size, otherSize, max, op}); + for ( final int min : Arrays.asList(0, 1, 5, 50) ) { + for (final String op : Arrays.asList("D", "I")) { + tests.add(new Object[]{size, otherSize, max, min, op}); + } } } } @@ -63,8 +65,8 @@ public class SelectVariantsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "MaxIndelSize") - public void maxIndelSizeTest(final int size, final int otherSize, final int max, final String op) { + @Test(dataProvider = "MaxMinIndelSize") + public void maxIndelSizeTest(final int size, final int otherSize, final int max, final int min, final String op) { final byte[] largerAllele = Utils.dupBytes((byte) 'A', size+1); final byte[] smallerAllele = Utils.dupBytes((byte) 'A', 1); @@ -74,15 +76,11 @@ public class SelectVariantsUnitTest extends BaseTest { final Allele alt = Allele.create(op.equals("D") ? smallerAllele : largerAllele, false); alleles.add(ref); alleles.add(alt); - if ( otherSize > 0 && otherSize != size ) { - final Allele otherAlt = Allele.create(op.equals("D") ? Utils.dupBytes((byte) 'A', size-otherSize+1) : Utils.dupBytes((byte) 'A', otherSize+1), false); - alleles.add(otherAlt); - } final VariantContext vc = new VariantContextBuilder("test", "1", 10, 10 + ref.length() - 1, alleles).make(); - boolean hasTooLargeIndel = SelectVariants.containsIndelLargerThan(vc, max); - Assert.assertEquals(hasTooLargeIndel, size > max); + boolean hasIndelTooLargeOrSmall = SelectVariants.containsIndelLargerOrSmallerThan(vc, max, min); + Assert.assertEquals(hasIndelTooLargeOrSmall, size > max || size < min); } } \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java index 4641e0542..883143a14 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java @@ -77,6 +77,17 @@ public class Utils { return x != y; } + /** + * Invert logic if specified + * + * @param logic boolean logical operation value + * @param invert whether to invert logic + * @return invert logic if invert flag is true, otherwise leave the logic + */ + public static boolean invertLogic(final boolean logic, final boolean invert){ + return logic ^ invert; + } + /** * Calculates the optimum initial size for a hash table given the maximum number * of elements it will need to hold. The optimum size is the smallest size that