From c3b7dd1374ece9cb8bffedd6518f5d6d9582eec0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:19:11 -0500 Subject: [PATCH] Misc cleanup in the HaplotypeCaller. Cleaning up unused arguments after recent changes to HC-GenotypingEngine --- .../haplotypecaller/GenotypingEngine.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 66 ++----------------- .../LikelihoodCalculationEngine.java | 1 - .../HaplotypeCallerIntegrationTest.java | 17 ++--- .../annotator/MappingQualityRankSumTest.java | 5 +- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../walkers/annotator/ReadPosRankSumTest.java | 2 +- 7 files changed, 21 insertions(+), 76 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index beec8a92e..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -41,13 +41,11 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2b739a321..24b3309f1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -428,43 +412,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 304f8d5cb..29622ca17 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -169,7 +169,6 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a57462d1d..007df3602 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,18 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -43,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -54,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -65,7 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test @@ -78,7 +79,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -92,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);