diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 709185821..21b069343 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -290,6 +290,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testKeepOriginalACAndENV--" + testFile, spec); } + @Test + public void testKeepOriginalDP() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalDP -R " + b37KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("e897097a47aee5516dc4f1c0b9d69037") + ); + + executeTest("testKeepOriginalDP--" + testFile, spec); + } + @Test public void testMultipleRecordsAtOnePosition() { String testFile = privateTestDir + "selectVariants.onePosition.vcf"; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 98393596f..6a5d34a6f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -214,7 +214,7 @@ public class SelectVariants extends RodWalker implements TreeR protected VariantContextWriter vcfWriter = null; @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) - public Set sampleNames = new HashSet(0); + public Set sampleNames = new HashSet<>(0); @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) public Set sampleExpressions ; @@ -226,19 +226,19 @@ public class SelectVariants extends RodWalker implements TreeR * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. */ @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false) - public Set XLsampleNames = new HashSet(0); + public Set XLsampleNames = new HashSet<>(0); /** * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. */ @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false) - public Set XLsampleFiles = new HashSet(0); + public Set XLsampleFiles = new HashSet<>(0); /** * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. */ @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) - public ArrayList SELECT_EXPRESSIONS = new ArrayList(); + public ArrayList SELECT_EXPRESSIONS = new ArrayList<>(); @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) protected boolean EXCLUDE_NON_VARIANTS = false; @@ -276,6 +276,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; + @Argument(fullName="keepOriginalDP", shortName="keepOriginalDP", doc="Store the original DP value in the INFO field (using the DP_Orig key) after selecting", required=false) + private boolean KEEP_ORIGINAL_DEPTH = false; + /** * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure. */ @@ -300,7 +303,7 @@ public class SelectVariants extends RodWalker implements TreeR * */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) - private List TYPES_TO_INCLUDE = new ArrayList(); + private List TYPES_TO_INCLUDE = new ArrayList<>(); /** * If provided, we will only include variants whose ID field is present in this list of ids. The matching @@ -336,11 +339,11 @@ public class SelectVariants extends RodWalker implements TreeR MULTIALLELIC } - private ArrayList selectedTypes = new ArrayList(); - private ArrayList selectNames = new ArrayList(); + private ArrayList selectedTypes = new ArrayList<>(); + private ArrayList selectNames = new ArrayList<>(); private List jexls = null; - private TreeSet samples = new TreeSet(); + private TreeSet samples = new TreeSet<>(); private boolean NO_SAMPLES_SPECIFIED = false; private boolean DISCORDANCE_ONLY = false; @@ -366,13 +369,13 @@ public class SelectVariants extends RodWalker implements TreeR List rodNames = Arrays.asList(variantCollection.variants.getName()); vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + TreeSet vcfSamples = new TreeSet<>(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); // first, check overlap between requested and present samples - Set commandLineUniqueSamples = new HashSet(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); + Set commandLineUniqueSamples = new HashSet<>(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); commandLineUniqueSamples.addAll(samplesFromFile); commandLineUniqueSamples.addAll(samplesFromExpressions); commandLineUniqueSamples.addAll(sampleNames); @@ -437,6 +440,8 @@ public class SelectVariants extends RodWalker implements TreeR headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY)); } + if (KEEP_ORIGINAL_DEPTH) + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_DP_KEY)); headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); @@ -464,7 +469,7 @@ public class SelectVariants extends RodWalker implements TreeR /** load in the IDs file to a hashset for matching */ if ( rsIDFile != null ) { - IDsToKeep = new HashSet(); + IDsToKeep = new HashSet<>(); try { for ( final String line : new XReadLines(rsIDFile).readLines() ) { IDsToKeep.add(line.trim()); @@ -788,6 +793,9 @@ public class SelectVariants extends RodWalker implements TreeR VariantContextUtils.calculateChromosomeCounts(builder, false); + if (KEEP_ORIGINAL_DEPTH && originalVC.hasAttribute(VCFConstants.DEPTH_KEY)) + builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY)); + boolean sawDP = false; int depth = 0; for ( final String sample : selectedSampleNames ) { diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 6154b0bfb..bcf8185fd 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -49,6 +49,7 @@ public final class GATKVCFConstants { public static final String CLIPPING_RANK_SUM_KEY = "ClippingRankSum"; public static final String CULPRIT_KEY = "culprit"; public static final String SPANNING_DELETIONS_KEY = "Dels"; + public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants public static final String DOWNSAMPLED_KEY = "DS"; public static final String FISHER_STRAND_KEY = "FS"; public static final String GC_CONTENT_KEY = "GC"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index ccfd89b60..89b9510d2 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -129,6 +129,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(ORIGINAL_AC_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Original AC")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_AF_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Original AF")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_AN_KEY, 1, VCFHeaderLineType.Integer, "Original AN")); + addInfoLine(new VCFInfoHeaderLine(ORIGINAL_DP_KEY, 1, VCFHeaderLineType.Integer, "Original DP")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record")); addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));