From b29eda83bbe962aea96682533a26e96ea299a431 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 16 Jul 2010 14:10:18 +0000 Subject: [PATCH] Parallelized CountCovarites! percent_ref_called_var now a standard genotype concordance module (for validation!). Really much smarter merging of headers for combineVariants. VCF codecs now actually look at the file version and blow up if they are the wrong versions. setHeaderVersion() in VCFHeaderLine. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3802 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/org/broad/tribble/vcf/VCFCodec.java | 2 + .../org/broad/tribble/vcf/VCFHeaderLine.java | 5 +++ .../gatk/refdata/features/vcf4/VCF4Codec.java | 2 + .../recalibration/CovariateCounterWalker.java | 38 ++++++++-------- .../recalibration/RecalDatumOptimized.java | 8 ++-- .../RecalibrationArgumentCollection.java | 2 - .../TableRecalibrationWalker.java | 11 +++-- .../varianteval/GenotypeConcordance.java | 38 +++++++++------- .../utils/collections/NestedHashMap.java | 2 +- .../sting/utils/genotype/vcf/VCFUtils.java | 5 ++- .../RecalibrationWalkersIntegrationTest.java | 44 ++++++++++--------- .../RecalibrationWalkersPerformanceTest.java | 20 ++++++--- .../VariantEvalIntegrationTest.java | 16 +++---- 13 files changed, 111 insertions(+), 82 deletions(-) diff --git a/java/src/org/broad/tribble/vcf/VCFCodec.java b/java/src/org/broad/tribble/vcf/VCFCodec.java index cb400fff3..0f6a378c0 100644 --- a/java/src/org/broad/tribble/vcf/VCFCodec.java +++ b/java/src/org/broad/tribble/vcf/VCFCodec.java @@ -72,6 +72,8 @@ public class VCFCodec implements FeatureCodec { try { while ((line = reader.readLine()) != null) { if (line.startsWith("##")) { + if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv3" ) ) + throw new CodecLineParsingException("VCF codec can only parse VCF3 formated files. Your version line is " + line + ". If you want to parse VCF4, use VCF4 use VCF as the rod type"); headerStrings.add(line); } else if (line.startsWith("#")) { diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java index fda574a3e..f3dadccf5 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java @@ -42,6 +42,7 @@ public class VCFHeaderLine implements Comparable { private String stringRep = null; private String mKey = null; private String mValue = null; + protected VCFHeaderVersion mVersion = null; /** @@ -135,6 +136,10 @@ public class VCFHeaderLine implements Comparable { this.mVersion = version; } + public VCFHeaderVersion getVersion() { + return mVersion; + } + /** * create a string of a mapping pair for the target VCF version * @param keyValues a mapping of the key->value pairs to output diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 90afdd594..014a3b5c5 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -80,6 +80,8 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { while ((line = reader.readLine()) != null) { lineNo++; if (line.startsWith("##")) { + if ( line.startsWith("##fileformat") && ! line.startsWith("##fileformat=VCFv4" ) ) + throw new CodecLineParsingException("VCF4 codec can only parse VCF4 formated files. Your version line is " + line + ". If you want VCF3 parsing, use VCF as the rod type."); headerStrings.add(line); } else if (line.startsWith("#")) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 4232bdf92..54c1a8995 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -70,7 +70,7 @@ import java.util.Map; @WalkerName( "CountCovariates" ) @ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta -public class CovariateCounterWalker extends LocusWalker { +public class CovariateCounterWalker extends LocusWalker implements TreeReducible { ///////////////////////////// // Constants @@ -87,6 +87,9 @@ public class CovariateCounterWalker extends LocusWalker { ///////////////////////////// // Command Line Arguments ///////////////////////////// + @Argument(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file") + public PrintStream RECAL_FILE; + @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) private boolean LIST_ONLY = false; @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) @@ -230,6 +233,12 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "\t" + cov.getClass().getSimpleName() ); cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection } + +// try { +// stream = new PrintStream( RAC.RECAL_FILE ); +// } catch ( FileNotFoundException e ) { +// throw new RuntimeException( "Couldn't open output file: ", e ); +// } } @@ -263,15 +272,10 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) { numUnprocessed = 0; // Reset the counter because we are processing this very locus - GATKSAMRecord gatkRead; - int offset; - byte refBase; - byte[] bases; - // For each read at this locus for( PileupElement p : context.getBasePileup() ) { - gatkRead = (GATKSAMRecord) p.getRead(); - offset = p.getOffset(); + GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead(); + int offset = p.getOffset(); if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { continue; @@ -297,8 +301,8 @@ public class CovariateCounterWalker extends LocusWalker { // Skip this position if base quality is zero if( gatkRead.getBaseQualities()[offset] > 0 ) { - bases = gatkRead.getReadBases(); - refBase = ref.getBase(); + byte[] bases = gatkRead.getReadBases(); + byte refBase = ref.getBase(); // Skip if this base is an 'N' or etc. if( BaseUtils.isRegularBase( bases[offset] ) ) { @@ -390,8 +394,6 @@ public class CovariateCounterWalker extends LocusWalker { * @param refBase The reference base at this locus */ private void updateDataFromRead(final GATKSAMRecord gatkRead, final int offset, final byte refBase) { - - final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); final Object[] key = covars[offset]; @@ -426,11 +428,7 @@ public class CovariateCounterWalker extends LocusWalker { * @return returns A PrintStream created from the -recalFile filename argument specified to the walker */ public PrintStream reduceInit() { - try { - return new PrintStream( RAC.RECAL_FILE ); - } catch ( FileNotFoundException e ) { - throw new RuntimeException( "Couldn't open output file: ", e ); - } + return RECAL_FILE; } /** @@ -440,7 +438,11 @@ public class CovariateCounterWalker extends LocusWalker { * @return returns The PrintStream used to output the CSV data */ public PrintStream reduce( Integer value, PrintStream recalTableStream ) { - return recalTableStream; // Nothing to do here + return recalTableStream; // Nothing to do here, just return our open stream + } + + public PrintStream treeReduce( PrintStream recalTableStream1, PrintStream recalTableStream2 ) { + return recalTableStream1; // Nothing to do here, just return our open stream } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java index b3bb58f61..c4ad3869a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java @@ -71,22 +71,22 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public final void increment( final long incObservations, final long incMismatches ) { + public synchronized final void increment( final long incObservations, final long incMismatches ) { numObservations += incObservations; numMismatches += incMismatches; } - public final void increment( final RecalDatumOptimized other ) { + public synchronized final void increment( final RecalDatumOptimized other ) { increment( other.numObservations, other.numMismatches ); } - public final void increment( final List data ) { + public synchronized final void increment( final List data ) { for ( RecalDatumOptimized other : data ) { this.increment( other ); } } - public final void increment( final char curBase, final char refBase ) { + public synchronized final void increment( final char curBase, final char refBase ) { increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index da7cf996c..62d8543a6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -41,8 +41,6 @@ public class RecalibrationArgumentCollection { ////////////////////////////////// // Shared Command Line Arguments ////////////////////////////////// - @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") - public String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 7d876ddff..4a8c85415 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -87,6 +87,9 @@ public class TableRecalibrationWalker extends ReadWalker entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); + for ( String parallelism : Arrays.asList("") ) { // todo -- enable parallel tests. They work but there's a system bug Arrays.asList("", " -nt 4")) { + for ( Map.Entry entry : e.entrySet() ) { + String bam = entry.getKey(); + String md5 = entry.getValue(); - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + oneKGLocation + "reference/human_b36_both.fasta" + - " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + - " -T CountCovariates" + - " -I " + bam + - ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) - ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) + - " -cov ReadGroupCovariate" + - " -cov QualityScoreCovariate" + - " -cov CycleCovariate" + - " -cov DinucCovariate" + - " -cov TileCovariate" + - " --solid_recal_mode SET_Q_ZERO" + - " -recalFile %s", - 1, // just one output file - Arrays.asList(md5)); - List result = executeTest("testCountCovariates1", spec).getFirst(); - paramsFiles.put(bam, result.get(0).getAbsolutePath()); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + oneKGLocation + "reference/human_b36_both.fasta" + + " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + + " -T CountCovariates" + + " -I " + bam + + ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) + ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) + + " -cov ReadGroupCovariate" + + " -cov QualityScoreCovariate" + + " -cov CycleCovariate" + + " -cov DinucCovariate" + + " -cov TileCovariate" + + " --solid_recal_mode SET_Q_ZERO" + + " -recalFile %s" + parallelism, + 1, // just one output file + Arrays.asList(md5)); + List result = executeTest("testCountCovariates1" + parallelism, spec).getFirst(); + paramsFiles.put(bam, result.get(0).getAbsolutePath()); + } } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java index 016fac9cc..0f7243ef0 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java @@ -8,8 +8,7 @@ import java.util.ArrayList; public class RecalibrationWalkersPerformanceTest extends WalkerTest { - @Test - public void testCountCovariatesWholeGenome() { + private void testCountCovariatesWholeGenomeRunner(String moreArgs) { WalkerTestSpec spec = new WalkerTestSpec( "-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" + " -T CountCovariates" + @@ -17,14 +16,13 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L chr1:1-50,000,000" + " -standard" + " -OQ" + - " -recalFile /dev/null", + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); executeTest("testCountCovariatesWholeGenome", spec); } - @Test - public void testCountCovariatesWholeExome() { + private void testCountCovariatesWholeExomeRunner(String moreArgs) { WalkerTestSpec spec = new WalkerTestSpec( "-R " + seqLocation + "references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" + " -T CountCovariates" + @@ -32,12 +30,22 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -standard" + " -OQ" + - " -recalFile /dev/null", + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); executeTest("testCountCovariatesWholeExome", spec); } + @Test + public void testCountCovariatesWholeGenome() { testCountCovariatesWholeGenomeRunner(""); } + @Test + public void testCountCovariatesWholeGenomeParallel() { testCountCovariatesWholeGenomeRunner(" -nt 4"); } + + @Test + public void testCountCovariatesWholeExome() { testCountCovariatesWholeExomeRunner(""); } + @Test + public void testCountCovariatesWholeExomeParallel() { testCountCovariatesWholeExomeRunner(" -nt 4"); } + @Test public void testTableRecalibratorWholeGenome() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3d7a34b56..c82aa13ac 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -29,7 +29,7 @@ public class String extraArgs = "-L 1:1-10,000,000"; for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s", - 1, Arrays.asList("97d2471ed6ee79d70ce5bd9cc0be2239")); + 1, Arrays.asList("119601d7e9e67a1053663b2e546250ed")); executeTest("testSelect1", spec); } } @@ -38,7 +38,7 @@ public class public void testSelect2() { String extraArgs = "-L 1:1-10,000,000"; WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s", - 1, Arrays.asList("e3366d73ee7cdf630c29809ce230e32e")); + 1, Arrays.asList("06d495ab8169a2570eebdc54ecdffe10")); executeTest("testSelect2", spec); } @@ -48,7 +48,7 @@ public class for (String vcfFile : vcfFiles) { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -B eval,VCF," + validationDataLocation + vcfFile + " -B comp,VCF," + validationDataLocation + "GenotypeConcordanceComp.vcf -E GenotypeConcordance -reportType CSV -o %s", 1, - Arrays.asList("51574b4ab0b381c5a01268f91e78b25c")); + Arrays.asList("15d1075d384da2bb7445f7493f2b6a07")); executeTest("testVEGenotypeConcordance" + vcfFile, spec); } @@ -57,8 +57,8 @@ public class @Test public void testVESimple() { HashMap expectations = new HashMap(); - expectations.put("-L 1:1-10,000,000", "8f76d8c0e3a8a5836bb5bf423e04c268"); - expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "6b128bacbd0402471bd6d4e3f9283c47"); + expectations.put("-L 1:1-10,000,000", "629b8b124306435ff56b66357354dfbc"); + expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "f51c299d500b347d098c7ab25f54a436"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs = entry.getKey(); @@ -80,10 +80,10 @@ public class " -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; - String matchingMD5 = "aca5a64a0e4850906db2bd820253b784"; + String matchingMD5 = "d01725ce4e46c8fea0855a923c1598fd"; expectations.put("", matchingMD5); expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5); - expectations.put(" -known comp_hapmap", "442213609c2866f7a90cbc4b3486441a"); + expectations.put(" -known comp_hapmap", "a50be9240f6c90503fb6333d8a78b974"); for (String tests : testsEnumerations) { for (Map.Entry entry : expectations.entrySet()) { String extraArgs2 = entry.getKey(); @@ -118,7 +118,7 @@ public class for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s", 2, - Arrays.asList("dc53aaf7db9f05e3b0a38bf5efe3fbbe", "d94328f4a5f7c40e95edf2ef13f38ae0")); + Arrays.asList("483f821ce96f4cf571e9bba356c9f325", "d94328f4a5f7c40e95edf2ef13f38ae0")); executeTest("testVEWriteVCF", spec); } }