From 300b474c964e668476410dfc9868de3d6ced56d3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Feb 2014 09:02:02 -0500 Subject: [PATCH] Several improvements to the single sample combining steps. 1. updated QualByDepth not to use AD-restricted depth if it is zero. Added unit test this change. 2. Fixed small bug in CombineGVCFs where spanning deletions were not being treated consistently throughout. Added test for this situation. 3. Make sure GenotypeGVCFs puts in the required headers. Updated test files to make sure this is covered. 4. Have GenotypeGVCFs propagate up the MLEAC/AF (which were getting clobbered out). Tests updated to account for this. --- .../gatk/walkers/annotator/QualByDepth.java | 36 +++++++++++-------- .../walkers/variantutils/CombineGVCFs.java | 21 ++++++++--- .../walkers/variantutils/GenotypeGVCFs.java | 19 ++++++---- .../annotator/QualByDepthUnitTest.java | 4 +-- .../CombineGVCFsIntegrationTest.java | 11 ++++-- .../GenotypeGVCFsIntegrationTest.java | 8 ++--- 6 files changed, 62 insertions(+), 37 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index aff38419c..7ebbd49dd 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -54,6 +54,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -86,7 +87,8 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( genotypes == null || genotypes.size() == 0 ) return null; - int depth = 0; + int standardDepth = 0; + int ADrestrictedDepth = 0; for ( final Genotype genotype : genotypes ) { @@ -101,39 +103,43 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati // TODO -- annotations must come afterwards (so that QD can use the AD). if ( genotype.hasAD() ) { final int[] AD = genotype.getAD(); - int variantDepth = 0; - for ( int i = 1; i < AD.length; i++ ) - variantDepth += AD[i]; - if ( variantDepth <= 1 ) - continue; + final int totalADdepth = (int)MathUtils.sum(AD); + if ( totalADdepth - AD[0] > 1 ) + ADrestrictedDepth += totalADdepth; + standardDepth += totalADdepth; + continue; } if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; - depth += context.getBasePileup().depthOfCoverage(); + standardDepth += context.getBasePileup().depthOfCoverage(); } else if (perReadAlleleLikelihoodMap != null) { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) continue; - depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements(); } else if ( genotype.hasDP() ) { - depth += genotype.getDP(); + standardDepth += genotype.getDP(); } } - if ( depth == 0 ) + // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward + if ( ADrestrictedDepth > 0 ) + standardDepth = ADrestrictedDepth; + + if ( standardDepth == 0 ) return null; final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); - // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a - // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper) - double QD = -10.0 * vc.getLog10PError() / ((double)depth * indelNormalizationFactor(altAlleleLength, ref != null)); + // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a + // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper) + double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null)); - // Hack: see note in the fixTooHighQD method below + // Hack: see note in the fixTooHighQD method below QD = fixTooHighQD(QD); final Map map = new HashMap<>(); @@ -149,7 +155,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati * @return a possitive double */ private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) { - return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0); + return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0); } /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java index 314f6ae42..0f577cb23 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFs.java @@ -205,12 +205,25 @@ public class CombineGVCFs extends RodWalker allele). + * + * @param vc the variant context + * @param pos the position to query against + * @return true if this variant context "ends" at this position, false otherwise + */ + private boolean isEndingContext(final VariantContext vc, final int pos) { + return vc.getNAlleles() > 2 || vc.getEnd() == pos; + } + /** * Disrupt the VariantContexts so that they all stop at the given pos, write them out, and put the remainder back in the list. * @@ -228,10 +241,8 @@ public class CombineGVCFs extends RodWalker allele) - if ( vc.getNAlleles() > 2 || vc.getEnd() == pos ) + // if it was ending anyways, then remove it from the future state + if ( isEndingContext(vc, pos) ) state.VCs.remove(i); } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java index afd657ea3..134ed0cd1 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFs.java @@ -56,10 +56,8 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.GenomeLoc; @@ -154,10 +152,14 @@ public class GenotypeGVCFs extends RodWalkeremptyList(), this, getToolkit()); + // take care of the VCF headers final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY); if ( dbsnp != null && dbsnp.dbsnp.isBound() ) VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); @@ -168,9 +170,6 @@ public class GenotypeGVCFs extends RodWalkeremptyList(), this, getToolkit()); - // collect the actual rod bindings into a list for use later for ( final RodBindingCollection variantCollection : variantCollections ) variants.addAll(variantCollection.getRodBindings()); @@ -206,7 +205,13 @@ public class GenotypeGVCFs extends RodWalker attrs = new HashMap<>(originalVC.getAttributes()); + attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + + result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make(); } // if it turned monomorphic and we don't want such sites, quit diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java index 4494e16bf..a118a462d 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepthUnitTest.java @@ -69,9 +69,6 @@ public class QualByDepthUnitTest extends WalkerTest { final List AA = Arrays.asList(A,A); final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); final List GG = Arrays.asList(G,G); final List ACG = Arrays.asList(A,C,G); @@ -81,6 +78,7 @@ public class QualByDepthUnitTest extends WalkerTest { final Genotype gGG = new GenotypeBuilder("4", GG).DP(10).AD(new int[]{1,9}).make(); tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC)).make(), 5.0}); + tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gACerror)).make(), 5.0}); tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAA, gAC)).make(), 5.0}); tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC, gACerror)).make(), 5.0}); tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, ACG).log10PError(-5).genotypes(Arrays.asList(gAA, gAC, gACerror, gGG)).make(), 2.5}); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java index 17b9124aa..03d136290 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -139,7 +139,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { final File gVCF = executeTest("testOneHasDeletionAndTwoHasRefBlock", spec).first.get(0); final List allVCs = GATKVCFUtils.readVCF(gVCF).getSecond(); - Assert.assertEquals(allVCs.size(), 2); + Assert.assertEquals(allVCs.size(), 3); final VariantContext first = allVCs.get(0); Assert.assertEquals(first.getStart(), 69772); @@ -149,14 +149,19 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { final VariantContext second = allVCs.get(1); Assert.assertEquals(second.getStart(), 69773); - Assert.assertEquals(second.getEnd(), 69783); + Assert.assertEquals(second.getEnd(), 69774); Assert.assertEquals(second.getGenotypes().size(), 2); + + final VariantContext third = allVCs.get(2); + Assert.assertEquals(third.getStart(), 69775); + Assert.assertEquals(third.getEnd(), 69783); + Assert.assertEquals(third.getGenotypes().size(), 2); } @Test public void testMD5s() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("d2a5ca67a8ef6e27854e7a439883f05d")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("aecdfa9eb32b802cd629e9f811ef15fd")); spec.disableShadowBCF(); executeTest("testMD5s", spec); } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index da7ec3dd4..fff353c60 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -65,7 +65,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("54487ea151c49d36a15eac8097a7e460")); + Arrays.asList("ebda39d3343b34d21490a78284ed88e8")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -89,7 +89,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("9e6ef126d5e872e5b2a68c3d73471566")); + Arrays.asList("8eeda24a07f66d67b7639a31fda5c903")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -99,7 +99,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("dd0e2846b3be9692ecb94f152b45c316")); + Arrays.asList("2541e164056d5632ad7de784a9af3880")); executeTest("testJustOneSample", spec); } @@ -110,7 +110,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("a4f76a094af4708fc7f96a9b7a1b8726")); + Arrays.asList("9daf9602338db9d06c075c6e9a15ee2c")); executeTest("testSamplesWithDifferentLs", spec); } } \ No newline at end of file