From 7f9f58dbd1c357ca07cae22ea875505b4797ebf8 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Thu, 27 Feb 2014 15:48:40 -0500 Subject: [PATCH] Added hidden flag to GenotypeConcordance to output sites of discordant genotypes (to System.out) Revised ConcondanceMetrics tests to adapt to change Added comments to PosteriorLikelihoodsUtils --- .../PosteriorLikelihoodsUtils.java | 33 ++++++++++++++++--- .../ConcordanceMetricsUnitTest.java | 22 ++++++------- .../variantutils/ConcordanceMetrics.java | 24 +++++++++++--- .../variantutils/GenotypeConcordance.java | 21 ++++++++---- 4 files changed, 75 insertions(+), 25 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java index 6f56415f7..4b81c199f 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java @@ -67,14 +67,18 @@ public class PosteriorLikelihoodsUtils { throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); final Map totalAlleleCounts = new HashMap<>(); + + //store the allele counts for each allele in the variant priors for ( final VariantContext resource : resources ) { addAlleleCounts(totalAlleleCounts,resource,useAC); } + //add the allele counts from the input samples (if applicable) if ( useInputSamples ) { addAlleleCounts(totalAlleleCounts,vc1,useAC); } + //add zero allele counts for any reference alleles not seen in priors (if applicable) totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); // now extract the counts of the alleles present within vc1, and in order @@ -86,6 +90,7 @@ public class PosteriorLikelihoodsUtils { totalAlleleCounts.get(allele) : 0 ); } + //parse the likelihoods for each sample's genotype final List likelihoods = new ArrayList<>(vc1.getNSamples()); for ( final Genotype genotype : vc1.getGenotypes() ) { likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null ); @@ -196,13 +201,24 @@ public class PosteriorLikelihoodsUtils { return priors; } + /** + * Parse counts for each allele + * @param counts - Map to store and return data + * @param context - line to be parsed from the input VCF file + * @param useAC - use allele count annotation value from VariantContext (vs. MLEAC) + */ private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { final int[] ac; + //use MLEAC value... if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! useAC ) { ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); - } else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + } + //...unless specified by the user in useAC + else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); - } else { + } + //if VariantContext annotation doesn't contain AC/MLEAC then get the data from another field + else { ac = new int[context.getAlternateAlleles().size()]; int idx = 0; for ( final Allele allele : context.getAlternateAlleles() ) { @@ -210,24 +226,33 @@ public class PosteriorLikelihoodsUtils { } } + //since the allele count for the reference allele is not given in the VCF format, + //calculate it from the allele number minus the total counts for alternate alleles for ( final Allele allele : context.getAlleles() ) { final int count; if ( allele.isReference() ) { if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac); + count = Math.max(context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac),0); //occasionally an MLEAC value will sneak in that's greater than the AN } else { - count = context.getCalledChrCount() - (int) MathUtils.sum(ac); + count = Math.max(context.getCalledChrCount() - (int) MathUtils.sum(ac),0); } } else { count = ac[context.getAlternateAlleles().indexOf(allele)]; } + //if this allele isn't in the map yet, add it if ( ! counts.containsKey(allele) ) { counts.put(allele,0); } + //add the count for the current allele to the existing value in the map counts.put(allele,count + counts.get(allele)); } } + /** + * Check the formatting on the Object returned by a call to VariantContext::getAttribute() and parse appropriately + * @param integerListContainingVCField - Object returned by a call to VariantContext::getAttribute() + * @return - array of ints + */ public static int[] extractInts(final Object integerListContainingVCField) { List mleList = null; if ( integerListContainingVCField instanceof List ) { diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 50c896450..6ece527ce 100755 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -135,7 +135,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),1); @@ -185,7 +185,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); @@ -205,7 +205,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { codec = new VCFCodec(); evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - metrics = new ConcordanceMetrics(evalHeader,compHeader); + metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); @@ -260,7 +260,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample1").getnMismatchingAlt(),1); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); @@ -313,7 +313,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[2][1],0); @@ -362,7 +362,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); metrics.update(eval,truth); Assert.assertTrue(eval.getGenotype("test1_sample2").getType().equals(GenotypeType.UNAVAILABLE)); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); @@ -516,7 +516,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); for ( Pair contextPair : data ) { VariantContext eval = contextPair.getFirst(); @@ -550,7 +550,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); int[][] table = metrics.getOverallGenotypeConcordance().getTable(); // set up the table table[0] = new int[] {30, 12, 7, 5, 6, 0}; @@ -585,8 +585,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_1)))); VCFHeader disjointCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2)))); VCFHeader overlapCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3)))); - ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader); - ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader); + ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader,false); + ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader,false); // test what happens if you put in disjoint sets and start making requests Assert.assertEquals(0,disjointMetrics.getPerSampleGenotypeConcordance().size()); @@ -716,7 +716,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VCFCodec codec = new VCFCodec(); VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); + ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,false); List> data = getData7(); diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java index 848261d73..79e0e8f9c 100755 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java @@ -42,8 +42,9 @@ public class ConcordanceMetrics { private Map perSampleGenotypeConcordance; private GenotypeConcordanceTable overallGenotypeConcordance; private SiteConcordanceTable overallSiteConcordance; + private Boolean printInterestingSites; - public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth) { + public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth, Boolean printSitesEnabled) { HashSet overlappingSamples = new HashSet(evaluate.getGenotypeSamples()); overlappingSamples.retainAll(truth.getGenotypeSamples()); perSampleGenotypeConcordance = new HashMap(overlappingSamples.size()); @@ -52,6 +53,7 @@ public class ConcordanceMetrics { } overallGenotypeConcordance = new GenotypeConcordanceTable(); overallSiteConcordance = new SiteConcordanceTable(); + printInterestingSites = printSitesEnabled; } public GenotypeConcordanceTable getOverallGenotypeConcordance() { @@ -114,6 +116,7 @@ public class ConcordanceMetrics { @Requires({"eval != null","truth != null"}) public void update(VariantContext eval, VariantContext truth) { + Boolean doPrint = false; overallSiteConcordance.update(eval,truth); Set alleleTruth = new HashSet(8); String truthRef = truth.getReference().getBaseString(); @@ -130,7 +133,12 @@ public class ConcordanceMetrics { throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy())); } perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); - overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); + doPrint = overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); + if(printInterestingSites && doPrint) + System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getType() + "\t eval is:" + evalGenotype.getType()); + + //Below is code to print out mismatched alternate alleles + //System.out.println(eval.getChr() + ":" + eval.getStart() + "\t truth is:" + truthGenotype.getAlleles() + "\t eval is:" + evalGenotype.getAlleles()); } } @@ -212,13 +220,14 @@ public class ConcordanceMetrics { } @Requires({"eval!=null","truth != null","truthAlleles != null"}) - public void update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { + public Boolean update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { // this is slow but correct. // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will // all match, so long as A and T are alleles in the truth callset. boolean matchingAlt = true; + int evalGT, truthGT; if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) { // by default, no-calls "match" between alleles, so if // one or both sites are no-call or unavailable, the alt alleles match @@ -241,10 +250,17 @@ public class ConcordanceMetrics { } if ( matchingAlt ) { - genotypeCounts[eval.getType().ordinal()][truth.getType().ordinal()]++; + evalGT = eval.getType().ordinal(); + truthGT = truth.getType().ordinal(); + genotypeCounts[evalGT][truthGT]++; + if(evalGT != truthGT) //report variants where genotypes don't match + return true; } else { nMismatchingAlt++; + return false; + //return true; //alternatively, report variants where alt alleles don't match } + return false; } public int[][] getTable() { diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java index 8c8961cb5..08c938583 100755 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -25,10 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -213,6 +210,16 @@ public class GenotypeConcordance extends RodWalker> evalCompList, ConcordanceMetrics metrics) { - for ( Pair evalComp : evalCompList) + for ( Pair evalComp : evalCompList){ metrics.update(evalComp.getFirst(),evalComp.getSecond()); + + } return metrics; }