From b39b5edca8e76bb389cbb6cb6d1db38f14368a8c Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 12 Mar 2010 19:23:12 +0000 Subject: [PATCH] Bug fix in variant eval 2. Preliminary (slow and buggy) support for -XL exclude lists. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2991 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 20 +++++++++++++++++++ .../arguments/GATKArgumentCollection.java | 7 +++++++ .../varianteval2/VariantEval2Walker.java | 2 +- .../arguments/GATKArgumentCollectionTest.java | 1 + .../VariantEval2IntegrationTest.java | 10 +++++----- 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 4cb029a46..9ae1271cd 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -158,10 +158,21 @@ public class GenomeAnalysisEngine { // create the output streams initializeOutputStreams(my_walker, microScheduler.getOutputTracker()); + // todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided + GenomeLocSortedSet excludeIntervals = null; + if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) { + excludeIntervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL)); + } + if (argCollection.intervals != null && argCollection.intervalMerging.check()) { intervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals)); } + if ( excludeIntervals != null ) { + GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals; + intervals = pruneIntervals( toPrune, excludeIntervals ); + } + ShardStrategy shardStrategy = getShardStrategy(my_walker, microScheduler.getReference(), intervals, @@ -294,6 +305,15 @@ public class GenomeAnalysisEngine { return microScheduler; } + private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) { + logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size())); + for ( GenomeLoc exclude : toExclude ) + toPrune.removeRegion(exclude); + logger.info(String.format("done pruning intervals == now have %d", toPrune.size())); + + return toPrune; + } + /** * setup the interval regions, from either the interval file of the genome region string * diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8babc3267..305519447 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -64,6 +64,10 @@ public class GATKArgumentCollection { @Argument(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.", required = false) public List intervals = null; + @ElementList(required = false) + @Argument(fullName = "excludeIntervals", shortName = "XL", doc = "A list of genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file.", required = false) + public List excludeIntervals = null; + @Element(required = false) @Argument(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; @@ -261,6 +265,9 @@ public class GATKArgumentCollection { if (!other.intervals.equals(this.intervals)) { return false; } + if (!other.excludeIntervals.equals(this.excludeIntervals)) { + return false; + } if (!other.DBSNPFile.equals(this.DBSNPFile)) { return false; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index c46f4a13a..f5ed2dfae 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -490,7 +490,7 @@ public class VariantEval2Walker extends RodWalker { VariantContext vc = contexts.size() == 1 ? contexts.iterator().next() : null; - if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) ) { + if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) && SAMPLES_LIST.size() > 0 ) { //if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc)); vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values()); //if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc)); diff --git a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionTest.java b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionTest.java index 74f313079..a82e7fe00 100755 --- a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionTest.java +++ b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionTest.java @@ -91,6 +91,7 @@ public class GATKArgumentCollectionTest extends BaseTest { collect.downsampleCoverage = null; collect.intervals = new ArrayList(); collect.intervals.add("intervals".toLowerCase()); + collect.excludeIntervals = new ArrayList(); collect.disableThreading = false; collect.outFileName = "outFileName".toLowerCase(); collect.errFileName = "errFileName".toLowerCase(); diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2IntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2IntegrationTest.java index d6b6a7c2b..5bba572de 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2IntegrationTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2IntegrationTest.java @@ -20,8 +20,8 @@ public class VariantEval2IntegrationTest extends WalkerTest { @Test public void testVE2Simple() { HashMap expectations = new HashMap(); - expectations.put("-L 1:1-10,000,000", "32b2e9758078b66e6d50d140acb37947"); - expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "5ee420ebf7c2d3c2e3827c0114a6706d"); + expectations.put("-L 1:1-10,000,000", "8f6d7d4ded62c4558b4c72053ca2f3d5"); + expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "3adaab00a5475504ede7bb13b2c8736f"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs = entry.getKey(); @@ -41,10 +41,10 @@ public class VariantEval2IntegrationTest extends WalkerTest { " -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" + " -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; - String eqMD5s = "ba021a4c963200191710a220a5577753"; // next two examples should be the same! + String eqMD5s = "5b51146d4282a236f2b6b73fe585a305"; // next two examples should be the same! expectations.put("", eqMD5s); expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s); - expectations.put(" -known comp_hapmap", "5ce16165f4242d77b4e82c704273c11d"); + expectations.put(" -known comp_hapmap", "bcb832f75afd63e2f66bc5490f89cac3"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs2 = entry.getKey(); @@ -62,7 +62,7 @@ public class VariantEval2IntegrationTest extends WalkerTest { String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30"; WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s", 2, - Arrays.asList("0b29285da3ca778b9c8b7f62e99aa72d", "d41d8cd98f00b204e9800998ecf8427e")); + Arrays.asList("2c9e03fe3d1c9aa32fbdbf74a5758e85", "a3ce1d70d8ae3874807e9d61994d42af")); executeTest("testVE2WriteVCF", spec); } }