Merge pull request #947 from broadinstitute/rhl_invert_selection
Added --invert_selection flag for variant selection queries
This commit is contained in:
commit
f6b3d8e862
|
|
@ -144,6 +144,14 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
executeTest("test filter with separate names #2", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInvertFilter() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1,
|
||||
Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765"));
|
||||
executeTest("test inversion of selection of filter with separate names #2", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenotypeFilters1() {
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
|
|
@ -194,4 +202,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("e10485c7c33d9211d0c1294fd7858476"));
|
||||
executeTest("testFilteringDPfromFORMAT", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInvertGenotypeFilterExpression() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
|
||||
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1,
|
||||
Arrays.asList("d2664870e7145eb73a2295766482c823"));
|
||||
executeTest("testInvertGenotypeFilterExpression", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,6 +64,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args;
|
||||
}
|
||||
|
||||
private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537";
|
||||
private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b";
|
||||
|
||||
@Test
|
||||
public void testDiscordanceNoSampleSpecified() {
|
||||
String testFile = privateTestDir + "NA12878.hg19.example1.vcf";
|
||||
|
|
@ -150,6 +153,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
executeTest("testNonExistingSelection--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding samples from file and sample name
|
||||
*/
|
||||
@Test
|
||||
public void testSampleExclusionFromFileAndSeparateSample() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
|
|
@ -162,9 +168,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
);
|
||||
spec.disableShadowBCF();
|
||||
|
||||
executeTest("testSampleExclusion--" + testfile, spec);
|
||||
executeTest("testSampleExclusionFromFileAndSeparateSample--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding samples from file
|
||||
*/
|
||||
@Test
|
||||
public void testSampleExclusionJustFromFile() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
|
|
@ -177,9 +186,46 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
);
|
||||
spec.disableShadowBCF();
|
||||
|
||||
executeTest("testSampleExclusion--" + testfile, spec);
|
||||
executeTest("testSampleExclusionJustFromFile--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding samples from expression
|
||||
*/
|
||||
@Test
|
||||
public void testSampleExclusionJustFromExpression() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList(sampleExclusionMD5)
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
|
||||
executeTest("testSampleExclusionJustFromExpression--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding samples from negation expression
|
||||
*/
|
||||
@Test
|
||||
public void testSampleExclusionJustFromNegationExpression() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList(sampleExclusionMD5)
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
|
||||
executeTest("testSampleExclusionJustFromRegexExpression--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test including samples that are not in the VCF
|
||||
*/
|
||||
@Test
|
||||
public void testSampleInclusionWithNonexistingSamples() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
|
|
@ -195,7 +241,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testConcordance() {
|
||||
String testFile = privateTestDir + "NA12878.hg19.example1.vcf";
|
||||
|
|
@ -212,6 +257,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
executeTest("testConcordance--" + testFile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test including variant types.
|
||||
*/
|
||||
@Test
|
||||
public void testVariantTypeSelection() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
|
|
@ -219,23 +267,42 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794")
|
||||
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
|
||||
);
|
||||
|
||||
executeTest("testVariantTypeSelection--" + testFile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding indels that are larger than the specified size
|
||||
*/
|
||||
@Test
|
||||
public void testIndelLengthSelection() {
|
||||
public void testMaxIndelLengthSelection() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3",
|
||||
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 2",
|
||||
1,
|
||||
Arrays.asList("004589868ca5dc887e2dff876b4cc797")
|
||||
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
|
||||
);
|
||||
|
||||
executeTest("testIndelLengthSelection--" + testFile, spec);
|
||||
executeTest("testMaxIndelLengthSelection--" + testFile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding indels that are smaller than the specified size
|
||||
*/
|
||||
@Test
|
||||
public void testMinIndelLengthSelection() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2",
|
||||
1,
|
||||
Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb")
|
||||
);
|
||||
|
||||
executeTest("testMinIndelLengthSelection--" + testFile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -488,4 +555,114 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
executeTest("testMissingVcfException--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inverting the variant selection criteria
|
||||
*/
|
||||
@Test
|
||||
public void testInvertSelection() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile),
|
||||
1,
|
||||
Arrays.asList(invertSelectionMD5)
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testInvertSelection--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test inverting the variant JEXL selection criteria
|
||||
*/
|
||||
@Test
|
||||
public void testInvertJexlSelection() {
|
||||
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
|
||||
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile),
|
||||
1,
|
||||
Arrays.asList(invertSelectionMD5)
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testInvertJexlSelection--" + testfile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test selecting variants with IDs
|
||||
*/
|
||||
@Test
|
||||
public void testKeepSelectionID() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
String idFile = privateTestDir + "complexExample1.vcf.id";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -IDs " + idFile + " --variant " + testFile),
|
||||
1,
|
||||
Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e")
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testKeepSelectionID--" + testFile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding variants with IDs
|
||||
*/
|
||||
@Test
|
||||
public void testExcludeSelectionID() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
String idFile = privateTestDir + "complexExample1.vcf.id";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -xlIDs " + idFile + " --variant " + testFile),
|
||||
1,
|
||||
Arrays.asList("77514a81233e1bbc0f5e47b0fb76a89a")
|
||||
);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testExcludeSelectionID--" + testFile, spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test excluding variant types
|
||||
*/
|
||||
@Test
|
||||
public void testExcludeSelectionType() {
|
||||
String testFile = privateTestDir + "complexExample1.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb")
|
||||
);
|
||||
|
||||
executeTest("testExcludeSelectionType--" + testFile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMendelianViolationSelection() {
|
||||
String testFile = privateTestDir + "CEUtrioTest.vcf";
|
||||
String pedFile = privateTestDir + "CEUtrio.ped";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("406243096074a417d2aa103bd3d13e01"));
|
||||
|
||||
executeTest("testMendelianViolationSelection--" + testFile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInvertMendelianViolationSelection() {
|
||||
String testFile = privateTestDir + "CEUtrioTest.vcf";
|
||||
String pedFile = privateTestDir + "CEUtrio.ped";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("35921fb2dedca0ead83027a66b725794"));
|
||||
|
||||
executeTest("testMendelianViolationSelection--" + testFile, spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@
|
|||
package org.broadinstitute.gatk.tools.walkers.filters;
|
||||
|
||||
import htsjdk.tribble.Feature;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import org.broadinstitute.gatk.utils.commandline.*;
|
||||
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
||||
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
|
||||
|
|
@ -104,13 +105,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
|
||||
*/
|
||||
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
|
||||
protected ArrayList<String> FILTER_EXPS = new ArrayList<String>();
|
||||
protected ArrayList<String> filterExpressions = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
|
||||
*/
|
||||
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
|
||||
protected ArrayList<String> FILTER_NAMES = new ArrayList<String>();
|
||||
protected ArrayList<String> filterNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
||||
|
|
@ -120,13 +121,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
* expressions isCalled, isNoCall, isMixed, and isAvailable, in accordance with the methods of the Genotype object.
|
||||
*/
|
||||
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
|
||||
protected ArrayList<String> GENOTYPE_FILTER_EXPS = new ArrayList<String>();
|
||||
protected ArrayList<String> genotypeFilterExpressions = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
||||
*/
|
||||
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
|
||||
protected ArrayList<String> GENOTYPE_FILTER_NAMES = new ArrayList<String>();
|
||||
protected ArrayList<String> genotypeFilterNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Works together with the --clusterWindowSize argument.
|
||||
|
|
@ -141,7 +142,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
protected Integer clusterWindow = 0;
|
||||
|
||||
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
|
||||
protected Integer MASK_EXTEND = 0;
|
||||
protected Integer maskExtension = 0;
|
||||
|
||||
/**
|
||||
* When using the -mask argument, the maskName will be annotated in the variant record.
|
||||
|
|
@ -150,7 +151,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
* (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking).
|
||||
*/
|
||||
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
|
||||
protected String MASK_NAME = "Mask";
|
||||
protected String maskName = "Mask";
|
||||
|
||||
/**
|
||||
* By default, if the -mask argument is used, any variant falling in a mask will be filtered.
|
||||
|
|
@ -167,7 +168,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
* Use this argument to have it evaluate as failing filters instead for these cases.
|
||||
*/
|
||||
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
|
||||
protected Boolean FAIL_MISSING_VALUES = false;
|
||||
protected Boolean failMissingValues = false;
|
||||
|
||||
/**
|
||||
* Invalidate previous filters applied to the VariantContext, applying only the filters here
|
||||
|
|
@ -175,6 +176,18 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false)
|
||||
boolean invalidatePrevious = false;
|
||||
|
||||
/**
|
||||
* Invert the selection criteria for --filterExpression
|
||||
*/
|
||||
@Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false)
|
||||
protected boolean invertFilterExpression = false;
|
||||
|
||||
/**
|
||||
* Invert the selection criteria for --genotypeFilterExpression
|
||||
*/
|
||||
@Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
|
||||
protected boolean invertGenotypeFilterExpression = false;
|
||||
|
||||
// JEXL expressions for the filters
|
||||
List<VariantContextUtils.JexlVCMatchExp> filterExps;
|
||||
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
|
||||
|
|
@ -185,9 +198,21 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
|
||||
// the structures necessary to initialize and maintain a windowed context
|
||||
private FiltrationContextWindow variantContextWindow;
|
||||
private static final int windowSize = 10; // 10 variants on either end of the current one
|
||||
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
|
||||
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
|
||||
|
||||
/**
|
||||
* Prepend inverse phrase to description if --invert_filter_expression
|
||||
*
|
||||
* @param description the description
|
||||
* @return the description with inverse prepended if --invert_filter_expression
|
||||
*/
|
||||
private String possiblyInvertFilterExpression( String description ){
|
||||
if ( invertFilterExpression )
|
||||
description = "Inverse of: " + description;
|
||||
return description;
|
||||
}
|
||||
|
||||
private void initializeVcfWriter() {
|
||||
|
||||
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
|
||||
|
|
@ -199,19 +224,19 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
if ( clusterWindow > 0 )
|
||||
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
|
||||
|
||||
if ( genotypeFilterExps.size() > 0 )
|
||||
if ( !genotypeFilterExps.isEmpty() )
|
||||
hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
|
||||
try {
|
||||
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps )
|
||||
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
|
||||
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
||||
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps )
|
||||
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString()));
|
||||
hInfo.add(new VCFFilterHeaderLine(exp.name, possiblyInvertFilterExpression(exp.exp.toString())));
|
||||
|
||||
if ( mask.isBound() ) {
|
||||
if (filterRecordsNotInMask)
|
||||
hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask"));
|
||||
else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
|
||||
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
|
||||
else hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
|
||||
}
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new UserException.BadInput(e.getMessage());
|
||||
|
|
@ -224,13 +249,13 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
if ( clusterWindow > 0 )
|
||||
clusteredSNPs = new ClusteredSnps(getToolkit().getGenomeLocParser(),clusterSize, clusterWindow);
|
||||
|
||||
if ( MASK_EXTEND < 0 )
|
||||
if ( maskExtension < 0 )
|
||||
throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed");
|
||||
|
||||
if (filterRecordsNotInMask && !mask.isBound())
|
||||
throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided");
|
||||
filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS);
|
||||
genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS);
|
||||
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
|
||||
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
|
||||
|
||||
VariantContextUtils.engine.get().setSilent(true);
|
||||
|
||||
|
|
@ -262,15 +287,9 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
if ( invalidatePrevious ) {
|
||||
vc = (new VariantContextBuilder(vc)).filters(new HashSet<String>()).make();
|
||||
}
|
||||
|
||||
// filter based on previous mask position
|
||||
if ( previousMaskPosition != null && // we saw a previous mask site
|
||||
previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig
|
||||
vc.getStart() - previousMaskPosition.getStop() <= MASK_EXTEND && // it's within the mask area (multi-base masks that overlap this site will always give a negative distance)
|
||||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
vc = addMaskIfCoversVariant(vc, previousMaskPosition, maskName, maskExtension, false);
|
||||
|
||||
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
||||
|
||||
|
|
@ -280,11 +299,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
// if this is a mask position, filter previous records
|
||||
if ( hasMask ) {
|
||||
for ( FiltrationContext prevVC : windowInitializer )
|
||||
prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus()));
|
||||
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
||||
}
|
||||
|
||||
windowInitializer.add(varContext);
|
||||
if ( windowInitializer.size() == windowSize ) {
|
||||
if ( windowInitializer.size() == WINDOW_SIZE ) {
|
||||
variantContextWindow = new FiltrationContextWindow(windowInitializer);
|
||||
windowInitializer = null;
|
||||
}
|
||||
|
|
@ -294,7 +313,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
if ( hasMask ) {
|
||||
for ( FiltrationContext prevVC : variantContextWindow.getWindow(10, 10) ) {
|
||||
if ( prevVC != null )
|
||||
prevVC.setVariantContext(checkMaskForPreviousLocation(prevVC.getVariantContext(), ref.getLocus()));
|
||||
prevVC.setVariantContext(addMaskIfCoversVariant(prevVC.getVariantContext(), ref.getLocus(), maskName, maskExtension, true));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -306,12 +325,44 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
private VariantContext checkMaskForPreviousLocation(VariantContext vc, GenomeLoc maskLoc) {
|
||||
if ( maskLoc.getContig().equals(vc.getChr()) && // it's on the same contig
|
||||
maskLoc.getStart() - vc.getEnd() <= MASK_EXTEND && // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
|
||||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
/**
|
||||
* Helper function to check if a mask covers the variant location.
|
||||
*
|
||||
* @param vc variant context
|
||||
* @param genomeLoc genome location
|
||||
* @param maskName name of the mask
|
||||
* @param maskExtension bases beyond the mask
|
||||
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
|
||||
* @return true if the genome location is within the extended mask area, false otherwise
|
||||
*/
|
||||
protected static boolean doesMaskCoverVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean vcBeforeLoc) {
|
||||
boolean logic = genomeLoc != null && // have a location
|
||||
genomeLoc.getContig().equals(vc.getChr()) && // it's on the same contig
|
||||
(vc.getFilters() == null || !vc.getFilters().contains(maskName)); // the filter hasn't already been applied
|
||||
if ( logic ) {
|
||||
if (vcBeforeLoc)
|
||||
return genomeLoc.getStart() - vc.getEnd() <= maskExtension; // it's within the mask area (multi-base VCs that overlap this site will always give a negative distance)
|
||||
else
|
||||
return vc.getStart() - genomeLoc.getStop() <= maskExtension;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add mask to variant context filters if it covers the it's location
|
||||
*
|
||||
* @param vc VariantContext
|
||||
* @param genomeLoc genome location
|
||||
* @param maskName name of the mask
|
||||
* @param maskExtension bases beyond the mask
|
||||
* @param locStart if true, start at genome location and end at VariantContext. If false, do the opposite.
|
||||
* @return VariantContext with the mask added if the VariantContext is within the extended mask area
|
||||
*/
|
||||
private VariantContext addMaskIfCoversVariant(VariantContext vc, GenomeLoc genomeLoc, String maskName, int maskExtension, boolean locStart) {
|
||||
if (doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, locStart) ) {
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
filters.add(maskName);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
|
||||
|
|
@ -328,7 +379,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
|
||||
// make new Genotypes based on filters
|
||||
if ( genotypeFilterExps.size() > 0 ) {
|
||||
if ( !genotypeFilterExps.isEmpty() ) {
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
||||
|
||||
// for each genotype, check filters then create a new object
|
||||
|
|
@ -338,7 +389,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
if ( g.isFiltered() ) filters.add(g.getFilters());
|
||||
|
||||
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
|
||||
if ( VariantContextUtils.match(vc, g, exp) )
|
||||
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
|
||||
filters.add(exp.name);
|
||||
}
|
||||
|
||||
|
|
@ -360,11 +411,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
|
||||
for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) {
|
||||
try {
|
||||
if ( VariantContextUtils.match(vc, exp) )
|
||||
if ( Utils.invertLogic(VariantContextUtils.match(vc, exp), invertFilterExpression) )
|
||||
filters.add(exp.name);
|
||||
} catch (Exception e) {
|
||||
// do nothing unless specifically asked to; it just means that the expression isn't defined for this context
|
||||
if ( FAIL_MISSING_VALUES )
|
||||
if ( failMissingValues )
|
||||
filters.add(exp.name);
|
||||
}
|
||||
}
|
||||
|
|
@ -389,11 +440,11 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
|
|||
public void onTraversalDone(Integer result) {
|
||||
// move the window over so that we can filter the last few variants
|
||||
if ( windowInitializer != null ) {
|
||||
while ( windowInitializer.size() < windowSize )
|
||||
while ( windowInitializer.size() < WINDOW_SIZE )
|
||||
windowInitializer.add(null);
|
||||
variantContextWindow = new FiltrationContextWindow(windowInitializer);
|
||||
}
|
||||
for (int i=0; i < windowSize; i++) {
|
||||
for (int i=0; i < WINDOW_SIZE; i++) {
|
||||
variantContextWindow.moveWindow(null);
|
||||
filter();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -113,9 +113,22 @@ import java.util.*;
|
|||
* -se 'SAMPLE.+PARC'
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select any sample that matches a regular expression and sites where the QD annotation is more than 10</h4>
|
||||
* <h4>Exclude two samples and any sample that matches a regular expression:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -xl_sn SAMPLE_1_PARC \
|
||||
* -xl_sn SAMPLE_1_ACTG \
|
||||
* -xl_se 'SAMPLE.+PARC'
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select any sample that matches a regular expression and sites where the QD annotation is more than 10:</h4>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* -R reference.fasta \
|
||||
* -V input.vcf \
|
||||
|
|
@ -124,9 +137,22 @@ import java.util.*;
|
|||
* -select "QD > 10.0"
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default)</h4>
|
||||
* <h4>Select any sample that does not match a regular expression and sites where the QD annotation is more than 10:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -se 'SAMPLE.+PARC' \
|
||||
* -select "QD > 10.0"
|
||||
* --invert_selection
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* -R reference.fasta \
|
||||
* -V input.vcf \
|
||||
|
|
@ -136,7 +162,7 @@ import java.util.*;
|
|||
* -ef
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select a sample, subset remaining alleles, but don't trim</h4>
|
||||
* <h4>Select a sample, subset remaining alleles, but don't trim:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -148,7 +174,7 @@ import java.util.*;
|
|||
* -noTrim
|
||||
*</pre>
|
||||
*
|
||||
* <h4>Select a sample and restrict the output vcf to a set of intervals</h4>
|
||||
* <h4>Select a sample and restrict the output vcf to a set of intervals:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -159,7 +185,7 @@ import java.util.*;
|
|||
* -sn SAMPLE_1_ACTG
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset)</h4>
|
||||
* <h4>Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called in my dataset):</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -170,7 +196,7 @@ import java.util.*;
|
|||
* -sn mySample
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers)</h4>
|
||||
* <h4>Select all calls made by both myCalls and theirCalls (useful to take a look at what is consistent between two callers):</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -192,7 +218,18 @@ import java.util.*;
|
|||
* -o violations.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Create a set with 50% of the total number of variants in the variant VCF</h4>
|
||||
* <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
* -R reference.fasta \
|
||||
* -V input.vcf \
|
||||
* -ped family.ped \
|
||||
* -mv -mvq 50 -inv_mv \
|
||||
* -o violations.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Create a set with 50% of the total number of variants in the variant VCF:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -202,17 +239,30 @@ import java.util.*;
|
|||
* -fraction 0.5
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select only indels from a VCF</h4>
|
||||
* <h4>Select only indels between 2 and 5 bases long from a VCF:</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* -R reference.fasta \
|
||||
* -V input.vcf \
|
||||
* -o output.vcf \
|
||||
* -selectType INDEL
|
||||
* -minIndelSize 2
|
||||
* -maxIndelSize 5
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column)</h4>
|
||||
* <h4>Exclude indels from a VCF:</h4>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -selectTypeToExclude INDEL
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T SelectVariants \
|
||||
|
|
@ -223,6 +273,17 @@ import java.util.*;
|
|||
* -restrictAllelesTo MULTIALLELIC
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Select IDs in fileKeep and exclude IDs in fileExclude:</h4>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SelectVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf \
|
||||
* -IDs fileKeep \
|
||||
* -excludeIDs fileExclude
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
||||
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
|
||||
|
|
@ -258,7 +319,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* argument can be specified multiple times in order to use multiple different matching patterns.
|
||||
*/
|
||||
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select multiple samples", required=false)
|
||||
public Set<String> sampleExpressions ;
|
||||
public Set<String> sampleExpressions;
|
||||
|
||||
/**
|
||||
* Sample names should be in a plain text file listing one sample name per line. This argument can be specified multiple times in order to provide
|
||||
|
|
@ -282,26 +343,40 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Input(fullName="exclude_sample_file", shortName="xl_sf", doc="List of samples to exclude", required=false)
|
||||
public Set<File> XLsampleFiles = new HashSet<>(0);
|
||||
|
||||
/**
|
||||
* Using a regular expression allows you to match multiple sample names that have that pattern in common. Note that sample exclusion takes precedence
|
||||
* over inclusion, so that if a sample is in both lists it will be excluded. This argument can be specified multiple times in order to use multiple
|
||||
* different matching patterns.
|
||||
*/
|
||||
@Input(fullName="exclude_sample_expressions", shortName="xl_se", doc="List of sample expressions to exclude", required=false)
|
||||
public Set<String> XLsampleExpressions = new HashSet<>(0);
|
||||
|
||||
/**
|
||||
* See example commands above for detailed usage examples. Note that these expressions are evaluated *after* the
|
||||
* specified samples are extracted and the INFO field annotations are updated.
|
||||
*/
|
||||
@Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
|
||||
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<>();
|
||||
public ArrayList<String> selectExpressions = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* Invert the selection criteria for --select.
|
||||
*/
|
||||
@Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false)
|
||||
protected boolean invertSelection = false;
|
||||
|
||||
/*
|
||||
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
|
||||
* of the selected samples display evidence of variation) will be excluded from the output.
|
||||
*/
|
||||
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include non-variant sites", required=false)
|
||||
protected boolean EXCLUDE_NON_VARIANTS = false;
|
||||
protected boolean XLnonVariants = false;
|
||||
|
||||
/**
|
||||
* If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS`
|
||||
* in the FILTER field) will be excluded from the output.
|
||||
*/
|
||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered sites", required=false)
|
||||
protected boolean EXCLUDE_FILTERED = false;
|
||||
protected boolean XLfiltered = false;
|
||||
|
||||
/**
|
||||
* The default behavior of this tool is to remove bases common to all remaining alleles after subsetting
|
||||
|
|
@ -338,7 +413,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* AC_Orig, AF_Orig, and AN_Orig.
|
||||
*/
|
||||
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false)
|
||||
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
|
||||
private boolean keepOriginalChrCounts = false;
|
||||
|
||||
/**
|
||||
* When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the contents of the
|
||||
|
|
@ -346,7 +421,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* DP_Orig.
|
||||
*/
|
||||
@Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value after subsetting", required=false)
|
||||
private boolean KEEP_ORIGINAL_DEPTH = false;
|
||||
private boolean keepOriginalDepth = false;
|
||||
|
||||
/**
|
||||
* If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as
|
||||
|
|
@ -354,14 +429,22 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* `-ped` argument.
|
||||
*/
|
||||
@Argument(fullName="mendelianViolation", shortName="mv", doc="Output mendelian violation sites only", required=false)
|
||||
private Boolean MENDELIAN_VIOLATIONS = false;
|
||||
private Boolean mendelianViolations = false;
|
||||
|
||||
/**
|
||||
* If this flag is enabled, this tool will select only variants that do not correspond to a mendelian violation as
|
||||
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
|
||||
* `-ped` argument.
|
||||
*/
|
||||
@Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false)
|
||||
private Boolean invert_mendelianViolations = false;
|
||||
|
||||
/**
|
||||
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
|
||||
* for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument to have an effect.
|
||||
*/
|
||||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum GQ score for each trio member to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
protected double medelianViolationQualThreshold = 0;
|
||||
|
||||
/**
|
||||
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
|
||||
|
|
@ -385,7 +468,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
|
||||
*/
|
||||
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file", required=false)
|
||||
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<>();
|
||||
private List<VariantContext.Type> typesToInclude = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* This argument excludes particular kinds of variants out of a list. If left empty, there is no type selection
|
||||
* and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP,
|
||||
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
|
||||
*/
|
||||
@Argument(fullName="selectTypeToExclude", shortName="xlSelectType", doc="Do not select certain type of variants from the input file", required=false)
|
||||
private List<VariantContext.Type> typesToExclude = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* If a file containing a list of IDs is provided to this argument, the tool will only select variants whose ID
|
||||
|
|
@ -395,29 +486,37 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Argument(fullName="keepIDs", shortName="IDs", doc="List of variant IDs to select", required=false)
|
||||
private File rsIDFile = null;
|
||||
|
||||
/**
|
||||
* If a file containing a list of IDs is provided to this argument, the tool will not select variants whose ID
|
||||
* field is present in this list of IDs. The matching is done by exact string matching. The expected file format
|
||||
* is simply plain text with one ID per line.
|
||||
*/
|
||||
@Argument(fullName="excludeIDs", shortName="xlIDs", doc="List of variant IDs to select", required=false)
|
||||
private File XLrsIDFile = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
|
||||
private boolean fullyDecode = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false)
|
||||
private boolean forceGenotypesDecode = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
|
||||
private boolean justRead = false;
|
||||
|
||||
/**
|
||||
* If this argument is provided, indels that are larger than the specified siwe will be excluded.
|
||||
* If this argument is provided, indels that are larger than the specified size will be excluded.
|
||||
*/
|
||||
@Argument(fullName="maxIndelSize", required=false, doc="Maximum size of indels to include")
|
||||
private int maxIndelSize = Integer.MAX_VALUE;
|
||||
|
||||
/**
|
||||
* If this argument is provided, indels that are smaller than the specified size will be excluded.
|
||||
*/
|
||||
@Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include")
|
||||
private int minIndelSize = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
|
||||
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
|
||||
|
||||
private boolean allowNonOverlappingCommandLineSamples = false;
|
||||
|
||||
public enum NumberAlleleRestriction {
|
||||
ALL,
|
||||
|
|
@ -430,21 +529,22 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
|
||||
|
||||
private TreeSet<String> samples = new TreeSet<>();
|
||||
private boolean NO_SAMPLES_SPECIFIED = false;
|
||||
private boolean noSamplesSpecified = false;
|
||||
|
||||
private boolean DISCORDANCE_ONLY = false;
|
||||
private boolean CONCORDANCE_ONLY = false;
|
||||
private boolean discordanceOnly = false;
|
||||
private boolean concordanceOnly = false;
|
||||
|
||||
private MendelianViolation mv;
|
||||
|
||||
|
||||
/* variables used by the SELECT RANDOM modules */
|
||||
private boolean SELECT_RANDOM_FRACTION = false;
|
||||
private boolean selectRandomFraction = false;
|
||||
|
||||
//Random number generator for the genotypes to remove
|
||||
private Random randomGenotypes = new Random();
|
||||
|
||||
private Set<String> IDsToKeep = null;
|
||||
private Set<String> IDsToRemove = null;
|
||||
private Map<String, VCFHeader> vcfRods;
|
||||
|
||||
/**
|
||||
|
|
@ -472,103 +572,126 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
samples.addAll(samplesFromExpressions);
|
||||
samples.addAll(samplesFromFile);
|
||||
|
||||
logger.debug(Utils.join(",",commandLineUniqueSamples));
|
||||
|
||||
if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) {
|
||||
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
|
||||
samples.removeAll(commandLineUniqueSamples);
|
||||
} else if (commandLineUniqueSamples.size() > 0 ) {
|
||||
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
|
||||
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
|
||||
"A list of these samples:",
|
||||
Utils.join(",",commandLineUniqueSamples),
|
||||
"To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES"));
|
||||
}
|
||||
logger.debug(Utils.join(",", commandLineUniqueSamples));
|
||||
|
||||
if (!commandLineUniqueSamples.isEmpty()) {
|
||||
if (allowNonOverlappingCommandLineSamples) {
|
||||
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
|
||||
samples.removeAll(commandLineUniqueSamples);
|
||||
} else {
|
||||
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
|
||||
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
|
||||
"A list of these samples:",
|
||||
Utils.join(",", commandLineUniqueSamples),
|
||||
"To ignore these samples, run with --allowNonOverlappingCommandLineSamples"));
|
||||
}
|
||||
}
|
||||
|
||||
// if none were requested, we want all of them
|
||||
if ( samples.isEmpty() ) {
|
||||
samples.addAll(vcfSamples);
|
||||
NO_SAMPLES_SPECIFIED = true;
|
||||
noSamplesSpecified = true;
|
||||
}
|
||||
|
||||
// now, exclude any requested samples
|
||||
final Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
|
||||
final Collection<String> XLsamplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, XLsampleExpressions);
|
||||
samples.removeAll(XLsamplesFromFile);
|
||||
samples.removeAll(XLsampleNames);
|
||||
NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty();
|
||||
samples.removeAll(XLsamplesFromExpressions);
|
||||
noSamplesSpecified = noSamplesSpecified && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty() &&
|
||||
XLsamplesFromExpressions.isEmpty();
|
||||
|
||||
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
|
||||
if ( samples.isEmpty() && !noSamplesSpecified )
|
||||
throw new UserException("All samples requested to be included were also requested to be excluded.");
|
||||
|
||||
if ( ! NO_SAMPLES_SPECIFIED )
|
||||
if ( ! noSamplesSpecified )
|
||||
for ( String sample : samples )
|
||||
logger.info("Including sample '" + sample + "'");
|
||||
|
||||
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
|
||||
if (TYPES_TO_INCLUDE.isEmpty()) {
|
||||
|
||||
if (typesToInclude.isEmpty()) {
|
||||
for (VariantContext.Type t : VariantContext.Type.values())
|
||||
selectedTypes.add(t);
|
||||
|
||||
}
|
||||
else {
|
||||
for (VariantContext.Type t : TYPES_TO_INCLUDE)
|
||||
for (VariantContext.Type t : typesToInclude)
|
||||
selectedTypes.add(t);
|
||||
|
||||
}
|
||||
|
||||
// remove specifed types
|
||||
for (VariantContext.Type t : typesToExclude)
|
||||
selectedTypes.remove(t);
|
||||
|
||||
// Initialize VCF header
|
||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
|
||||
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
|
||||
|
||||
if (KEEP_ORIGINAL_CHR_COUNTS) {
|
||||
if (keepOriginalChrCounts) {
|
||||
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
|
||||
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY));
|
||||
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY));
|
||||
}
|
||||
if (KEEP_ORIGINAL_DEPTH)
|
||||
if (keepOriginalDepth)
|
||||
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_DP_KEY));
|
||||
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
|
||||
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));
|
||||
|
||||
for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
|
||||
for (int i = 0; i < selectExpressions.size(); i++) {
|
||||
// It's not necessary that the user supply select names for the JEXL expressions, since those
|
||||
// expressions will only be needed for omitting records. Make up the select names here.
|
||||
selectNames.add(String.format("select-%d", i));
|
||||
}
|
||||
|
||||
jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);
|
||||
jexls = VariantContextUtils.initializeMatchExps(selectNames, selectExpressions);
|
||||
|
||||
// Look at the parameters to decide which analysis to perform
|
||||
DISCORDANCE_ONLY = discordanceTrack.isBound();
|
||||
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());
|
||||
discordanceOnly = discordanceTrack.isBound();
|
||||
if (discordanceOnly) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());
|
||||
|
||||
CONCORDANCE_ONLY = concordanceTrack.isBound();
|
||||
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
|
||||
concordanceOnly = concordanceTrack.isBound();
|
||||
if (concordanceOnly) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
|
||||
|
||||
if (MENDELIAN_VIOLATIONS) {
|
||||
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
|
||||
if (mendelianViolations) {
|
||||
mv = new MendelianViolation(medelianViolationQualThreshold,false,true);
|
||||
}
|
||||
|
||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||
selectRandomFraction = fractionRandom > 0;
|
||||
if (selectRandomFraction) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||
|
||||
/** load in the IDs file to a hashset for matching */
|
||||
if ( rsIDFile != null ) {
|
||||
IDsToKeep = new HashSet<>();
|
||||
try {
|
||||
for ( final String line : new XReadLines(rsIDFile).readLines() ) {
|
||||
IDsToKeep.add(line.trim());
|
||||
}
|
||||
logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(rsIDFile, e);
|
||||
}
|
||||
}
|
||||
// Get variant IDs to keep and removed
|
||||
IDsToKeep = getIDsFromFile(rsIDFile);
|
||||
|
||||
IDsToRemove = getIDsFromFile(XLrsIDFile);
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get IDs from a file
|
||||
*
|
||||
* @param file file containing the IDs
|
||||
* @return set of IDs or null if the file is null
|
||||
* @throws UserException.CouldNotReadInputFile if could not read the file
|
||||
*/
|
||||
private Set<String> getIDsFromFile(final File file){
|
||||
/** load in the IDs file to a hashset for matching */
|
||||
if ( file != null ) {
|
||||
Set<String> ids = new HashSet<>();
|
||||
try {
|
||||
for ( final java.lang.String line : new XReadLines(file).readLines() ) {
|
||||
ids.add(line.trim());
|
||||
}
|
||||
logger.info("Selecting only variants with one of " + ids.size() + " IDs from " + file);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(file, e);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing)
|
||||
*
|
||||
|
|
@ -584,7 +707,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
|
||||
|
||||
if ( vcs == null || vcs.size() == 0) {
|
||||
if ( vcs == null || vcs.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -593,24 +716,21 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if ( fullyDecode )
|
||||
vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );
|
||||
|
||||
// an option for performance testing only
|
||||
if ( forceGenotypesDecode ) {
|
||||
final int x = vc.getCalledChrCount();
|
||||
//logger.info("forceGenotypesDecode with getCalledChrCount() = " + );
|
||||
}
|
||||
|
||||
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
|
||||
if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) )
|
||||
continue;
|
||||
|
||||
if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1)
|
||||
if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) )
|
||||
continue;
|
||||
|
||||
if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) )
|
||||
break;
|
||||
|
||||
if (DISCORDANCE_ONLY) {
|
||||
if (discordanceOnly) {
|
||||
Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation());
|
||||
if (!isDiscordant(vc, compVCs))
|
||||
continue;
|
||||
}
|
||||
if (CONCORDANCE_ONLY) {
|
||||
if (concordanceOnly) {
|
||||
Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation());
|
||||
if (!isConcordant(vc, compVCs))
|
||||
continue;
|
||||
|
|
@ -625,16 +745,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if (!selectedTypes.contains(vc.getType()))
|
||||
continue;
|
||||
|
||||
if ( containsIndelLargerThan(vc, maxIndelSize) )
|
||||
if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) )
|
||||
continue;
|
||||
|
||||
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
|
||||
|
||||
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
||||
// Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered
|
||||
if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) {
|
||||
|
||||
// Write the subsetted variant if it matches all of the expressions
|
||||
boolean failedJexlMatch = false;
|
||||
|
||||
try {
|
||||
for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
|
||||
if (!VariantContextUtils.match(sub, jexl)) {
|
||||
if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){
|
||||
failedJexlMatch = true;
|
||||
break;
|
||||
}
|
||||
|
|
@ -646,7 +770,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
}
|
||||
if ( !failedJexlMatch &&
|
||||
!justRead &&
|
||||
( !SELECT_RANDOM_FRACTION || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
|
||||
( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
|
||||
vcfWriter.add(sub);
|
||||
}
|
||||
}
|
||||
|
|
@ -656,19 +780,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
}
|
||||
|
||||
/*
|
||||
* Determines if any of the alternate alleles are greater than the max indel size
|
||||
* Determines if any of the alternate alleles are greater than the max indel size or less than the min indel size
|
||||
*
|
||||
* @param vc the variant context to check
|
||||
* @param maxIndelSize the maximum size of allowed indels
|
||||
* @return true if the VC contains an indel larger than maxIndelSize and false otherwise
|
||||
* @param minIndelSize the minimum size of allowed indels
|
||||
* @return true if the VC contains an indel larger than maxIndelSize or less than the minIndelSize, false otherwise
|
||||
*/
|
||||
protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) {
|
||||
protected static boolean containsIndelLargerOrSmallerThan(final VariantContext vc, final int maxIndelSize, final int minIndelSize) {
|
||||
final List<Integer> lengths = vc.getIndelLengths();
|
||||
if ( lengths == null )
|
||||
return false;
|
||||
|
||||
for ( Integer indelLength : lengths ) {
|
||||
if ( Math.abs(indelLength) > maxIndelSize )
|
||||
if ( Math.abs(indelLength) > maxIndelSize || Math.abs(indelLength) < minIndelSize )
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -677,16 +802,17 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
/**
|
||||
* Checks if vc has a variant call for (at least one of) the samples.
|
||||
*
|
||||
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
|
||||
* @param compVCs the comparison VariantContext (discordance
|
||||
* @return true if is discordant
|
||||
* @param compVCs the comparison VariantContext (discordance)
|
||||
* @return true VariantContexts are discordant, false otherwise
|
||||
*/
|
||||
private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
|
||||
if (vc == null)
|
||||
return false;
|
||||
|
||||
// if we're not looking at specific samples then the absence of a compVC means discordance
|
||||
if (NO_SAMPLES_SPECIFIED)
|
||||
if (noSamplesSpecified)
|
||||
return (compVCs == null || compVCs.isEmpty());
|
||||
|
||||
// check if we find it in the variant rod
|
||||
|
|
@ -712,12 +838,19 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
return false; // we only get here if all samples have a variant in the comp rod.
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the two variants have the same genotypes for the selected samples
|
||||
*
|
||||
* @param vc the variant rod VariantContext.
|
||||
* @param compVCs the comparison VariantContext
|
||||
* @return true if VariantContexts are concordant, false otherwise
|
||||
*/
|
||||
private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
|
||||
if (vc == null || compVCs == null || compVCs.isEmpty())
|
||||
return false;
|
||||
|
||||
// if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
|
||||
if (NO_SAMPLES_SPECIFIED)
|
||||
if (noSamplesSpecified)
|
||||
return true;
|
||||
|
||||
// make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
|
||||
|
|
@ -744,7 +877,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
}
|
||||
|
||||
private boolean sampleHasVariant(Genotype g) {
|
||||
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
|
||||
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered)));
|
||||
}
|
||||
|
||||
private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
|
||||
|
|
@ -753,7 +886,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
if ((g1.isCalled() && g2.isFiltered()) ||
|
||||
(g2.isCalled() && g1.isFiltered()) ||
|
||||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
|
||||
(g1.isFiltered() && g2.isFiltered() && XLfiltered))
|
||||
return false;
|
||||
|
||||
List<Allele> a1s = g1.getAlleles();
|
||||
|
|
@ -787,7 +920,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
*/
|
||||
private VariantContext subsetRecord(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
|
||||
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
|
||||
if ( NO_SAMPLES_SPECIFIED && !removeUnusedAlternates )
|
||||
if ( noSamplesSpecified && !removeUnusedAlternates )
|
||||
return vc;
|
||||
|
||||
// strip out the alternate alleles that aren't being used
|
||||
|
|
@ -843,7 +976,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set<String> selectedSampleNames) {
|
||||
if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data
|
||||
|
||||
if ( KEEP_ORIGINAL_CHR_COUNTS ) {
|
||||
if ( keepOriginalChrCounts ) {
|
||||
final int[] indexOfOriginalAlleleForNewAllele;
|
||||
final List<Allele> newAlleles = builder.getAlleles();
|
||||
final int numOriginalAlleles = originalVC.getNAlleles();
|
||||
|
|
@ -879,7 +1012,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||
|
||||
if (KEEP_ORIGINAL_DEPTH && originalVC.hasAttribute(VCFConstants.DEPTH_KEY))
|
||||
if (keepOriginalDepth && originalVC.hasAttribute(VCFConstants.DEPTH_KEY))
|
||||
builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY));
|
||||
|
||||
boolean sawDP = false;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.gatk.tools.walkers.filters;
|
||||
|
||||
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||
import org.broadinstitute.gatk.utils.GenomeLocParser;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import htsjdk.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.*;
|
||||
|
||||
public class VariantFiltrationUnitTest extends BaseTest {
|
||||
|
||||
private String chr1 = null;
|
||||
private GenomeLoc genomeLoc = null;
|
||||
private String vcFilter = "testFilter";
|
||||
|
||||
@BeforeTest
|
||||
public void before() {
|
||||
// Create GenomeLoc
|
||||
IndexedFastaSequenceFile fasta = CachingIndexedFastaSequenceFile.checkAndCreate(new File(privateTestDir + "iupacFASTA.fasta"));
|
||||
GenomeLocParser genomeLocParser = new GenomeLocParser(fasta);
|
||||
chr1 = fasta.getSequenceDictionary().getSequence(0).getSequenceName();
|
||||
genomeLoc = genomeLocParser.createGenomeLoc(chr1, 5, 10);
|
||||
}
|
||||
|
||||
@DataProvider(name = "VariantMaskData")
|
||||
public Object[][] DoesMaskCoverVariantTestData() {
|
||||
|
||||
final String maskName = "testMask";
|
||||
|
||||
List<Object[]> tests = Arrays.asList(new Object[]{chr1, 0, 0, maskName, 10, true, true},
|
||||
new Object[]{"chr2", 0, 0, maskName, 10, true, false},
|
||||
new Object[]{chr1, 0, 0, null, 10, true, true},
|
||||
new Object[]{chr1, 0, 0, maskName, 10, true, true},
|
||||
new Object[]{chr1, 0, 0, vcFilter, 10, true, false},
|
||||
new Object[]{chr1, 0, 0, maskName, 1, true, false},
|
||||
new Object[]{chr1, 15, 15, maskName, 10, false, true},
|
||||
new Object[]{chr1, 15, 15, maskName, 1, false, false}
|
||||
);
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test doesMaskCoverVariant() logic
|
||||
*
|
||||
* @param contig chromosome or contig name
|
||||
* @param start variant context start
|
||||
* @param stop variant context stop
|
||||
* @param maskName mask or filter name
|
||||
* @param maskExtension bases beyond the mask
|
||||
* @param vcBeforeLoc if true, variant context is before the genome location; if false, the converse is true.
|
||||
* @param expectedValue return the expected return value from doesMaskCoverVariant()
|
||||
*/
|
||||
@Test(dataProvider = "VariantMaskData")
|
||||
public void TestDoesMaskCoverVariant(final String contig, final int start, final int stop, final String maskName, final int maskExtension,
|
||||
final boolean vcBeforeLoc, final boolean expectedValue) {
|
||||
|
||||
// Build VariantContext
|
||||
final byte[] allele1 = Utils.dupBytes((byte) 'A', 1);
|
||||
final byte[] allele2 = Utils.dupBytes((byte) 'T', 2);
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>(2);
|
||||
final Allele ref = Allele.create(allele1, true);
|
||||
final Allele alt = Allele.create(allele2, false);
|
||||
alleles.add(ref);
|
||||
alleles.add(alt);
|
||||
|
||||
final VariantContext vc = new VariantContextBuilder("test", contig, start, stop, alleles).filter(vcFilter).make();
|
||||
|
||||
boolean coversVariant = VariantFiltration.doesMaskCoverVariant(vc, genomeLoc, maskName, maskExtension, vcBeforeLoc);
|
||||
Assert.assertEquals(coversVariant, expectedValue);
|
||||
}
|
||||
}
|
||||
|
|
@ -41,20 +41,22 @@ import java.util.List;
|
|||
|
||||
public class SelectVariantsUnitTest extends BaseTest {
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Tests for maxIndelSize functionality //
|
||||
//////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////
|
||||
// Tests for maxIndelSize and minIndelSize functionality //
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "MaxIndelSize")
|
||||
public Object[][] MaxIndelSizeTestData() {
|
||||
@DataProvider(name = "MaxMinIndelSize")
|
||||
public Object[][] MaxMinIndelSizeTestData() {
|
||||
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int size : Arrays.asList(1, 3, 10, 100) ) {
|
||||
for ( final int otherSize : Arrays.asList(0, 1) ) {
|
||||
for ( final int max : Arrays.asList(0, 1, 5, 50, 100000) ) {
|
||||
for ( final String op : Arrays.asList("D", "I") ) {
|
||||
tests.add(new Object[]{size, otherSize, max, op});
|
||||
for ( final int min : Arrays.asList(0, 1, 5, 50) ) {
|
||||
for (final String op : Arrays.asList("D", "I")) {
|
||||
tests.add(new Object[]{size, otherSize, max, min, op});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -63,8 +65,8 @@ public class SelectVariantsUnitTest extends BaseTest {
|
|||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MaxIndelSize")
|
||||
public void maxIndelSizeTest(final int size, final int otherSize, final int max, final String op) {
|
||||
@Test(dataProvider = "MaxMinIndelSize")
|
||||
public void maxIndelSizeTest(final int size, final int otherSize, final int max, final int min, final String op) {
|
||||
|
||||
final byte[] largerAllele = Utils.dupBytes((byte) 'A', size+1);
|
||||
final byte[] smallerAllele = Utils.dupBytes((byte) 'A', 1);
|
||||
|
|
@ -74,15 +76,11 @@ public class SelectVariantsUnitTest extends BaseTest {
|
|||
final Allele alt = Allele.create(op.equals("D") ? smallerAllele : largerAllele, false);
|
||||
alleles.add(ref);
|
||||
alleles.add(alt);
|
||||
if ( otherSize > 0 && otherSize != size ) {
|
||||
final Allele otherAlt = Allele.create(op.equals("D") ? Utils.dupBytes((byte) 'A', size-otherSize+1) : Utils.dupBytes((byte) 'A', otherSize+1), false);
|
||||
alleles.add(otherAlt);
|
||||
}
|
||||
|
||||
final VariantContext vc = new VariantContextBuilder("test", "1", 10, 10 + ref.length() - 1, alleles).make();
|
||||
|
||||
boolean hasTooLargeIndel = SelectVariants.containsIndelLargerThan(vc, max);
|
||||
Assert.assertEquals(hasTooLargeIndel, size > max);
|
||||
boolean hasIndelTooLargeOrSmall = SelectVariants.containsIndelLargerOrSmallerThan(vc, max, min);
|
||||
Assert.assertEquals(hasIndelTooLargeOrSmall, size > max || size < min);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -77,6 +77,17 @@ public class Utils {
|
|||
return x != y;
|
||||
}
|
||||
|
||||
/**
|
||||
* Invert logic if specified
|
||||
*
|
||||
* @param logic boolean logical operation value
|
||||
* @param invert whether to invert logic
|
||||
* @return invert logic if invert flag is true, otherwise leave the logic
|
||||
*/
|
||||
public static boolean invertLogic(final boolean logic, final boolean invert){
|
||||
return logic ^ invert;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the optimum initial size for a hash table given the maximum number
|
||||
* of elements it will need to hold. The optimum size is the smallest size that
|
||||
|
|
|
|||
Loading…
Reference in New Issue