From 92c7c103c1fb671024dc14bf157b9cd357fd332f Mon Sep 17 00:00:00 2001 From: Phillip Dexheimer Date: Wed, 28 Jan 2015 22:31:20 -0500 Subject: [PATCH] GenotypeConcordance: monomorphic sites in truth are no longer called "Mismatching Alleles" when the comp genotype has an alternate allele * PT 84700606 --- .../ConcordanceMetricsUnitTest.java | 41 +++++++++ .../variantutils/ConcordanceMetrics.java | 88 ++++++++----------- .../variantutils/GenotypeConcordance.java | 4 +- 3 files changed, 82 insertions(+), 51 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java index 38ac62287..c7043bad9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -742,4 +742,45 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH),1); } + + private Pair getMonoallelicData() { + + final Allele ref = Allele.create(BaseUtils.Base.T.base,true); + final Allele alt = Allele.create(BaseUtils.Base.C.base); + + //Site in eval is monoallelic, both samples are HOM_REF + //sample1 in comp is HOM_VAR, sample2 is NO_CALL + //None of these should trigger mismatching alleles + final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1",1,1); + final VariantContextBuilder site1Comp = new VariantContextBuilder(); + final VariantContextBuilder site1Eval = new VariantContextBuilder(); + site1Comp.loc(loc.getContig(), loc.getStart(), loc.getStop()); + site1Eval.loc(loc.getContig(), loc.getStart(), loc.getStop()); + site1Comp.alleles(Arrays.asList(ref)); + site1Eval.alleles(Arrays.asList(ref, alt)); + site1Comp.genotypes(GenotypeBuilder.create("test2_sample1", Arrays.asList(ref, ref)), + GenotypeBuilder.create("test2_sample2", Arrays.asList(ref, ref))); + site1Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(alt,alt)), + GenotypeBuilder.create("test2_sample2",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL))); + + return new Pair<>(site1Eval.make(), site1Comp.make()); + } + + @Test + public void testMonoallelicSite() { + final Pair data = getMonoallelicData(); + final VariantContext eval = data.getFirst(); + final VariantContext truth = data.getSecond(); + final VCFCodec codec = new VCFCodec(); + final VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + final VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + final ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,null); + metrics.update(eval,truth); + + + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getTable()[3][1],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getTable()[0][1],1); + } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetrics.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetrics.java index 2b1897c04..7085b515b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetrics.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetrics.java @@ -46,9 +46,9 @@ public class ConcordanceMetrics { final PrintStream sitesFile; public ConcordanceMetrics(VCFHeader evaluate, VCFHeader truth, PrintStream inputSitesFile) { - HashSet overlappingSamples = new HashSet(evaluate.getGenotypeSamples()); + HashSet overlappingSamples = new HashSet<>(evaluate.getGenotypeSamples()); overlappingSamples.retainAll(truth.getGenotypeSamples()); - perSampleGenotypeConcordance = new HashMap(overlappingSamples.size()); + perSampleGenotypeConcordance = new HashMap<>(overlappingSamples.size()); for ( String sample : overlappingSamples ) { perSampleGenotypeConcordance.put(sample,new GenotypeConcordanceTable()); } @@ -82,7 +82,7 @@ public class ConcordanceMetrics { } public Map getPerSampleNRD() { - Map nrd = new HashMap(perSampleGenotypeConcordance.size()); + Map nrd = new HashMap<>(perSampleGenotypeConcordance.size()); for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { nrd.put(sampleTable.getKey(),calculateNRD(sampleTable.getValue())); } @@ -91,7 +91,7 @@ public class ConcordanceMetrics { } public Map getPerSampleOGC() { - Map ogc = new HashMap(perSampleGenotypeConcordance.size()); + Map ogc = new HashMap<>(perSampleGenotypeConcordance.size()); for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { ogc.put(sampleTable.getKey(),calculateOGC(sampleTable.getValue())); } @@ -108,7 +108,7 @@ public class ConcordanceMetrics { } public Map getPerSampleNRS() { - Map nrs = new HashMap(perSampleGenotypeConcordance.size()); + Map nrs = new HashMap<>(perSampleGenotypeConcordance.size()); for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { nrs.put(sampleTable.getKey(),calculateNRS(sampleTable.getValue())); } @@ -121,25 +121,20 @@ public class ConcordanceMetrics { } @Requires({"eval != null","truth != null"}) - public void update(VariantContext eval, VariantContext truth) { - boolean doPrint = false; + public void update(final VariantContext eval, final VariantContext truth) { overallSiteConcordance.update(eval,truth); - Set alleleTruth = new HashSet(8); - String truthRef = truth.getReference().getBaseString(); - alleleTruth.add(truthRef); - for ( Allele a : truth.getAlternateAlleles() ) { - alleleTruth.add(a.getBaseString()); - } - for ( String sample : perSampleGenotypeConcordance.keySet() ) { - Genotype evalGenotype = eval.getGenotype(sample); - Genotype truthGenotype = truth.getGenotype(sample); + final Set truthAlleles = new HashSet<>(truth.getAlleles()); + for ( final String sample : perSampleGenotypeConcordance.keySet() ) { + final Genotype evalGenotype = eval.getGenotype(sample); + final Genotype truthGenotype = truth.getGenotype(sample); // ensure genotypes are either no-call ("."), missing (empty alleles), or diploid if ( ( ! evalGenotype.isNoCall() && evalGenotype.getPloidy() != 2 && evalGenotype.getPloidy() > 0) || ( ! truthGenotype.isNoCall() && truthGenotype.getPloidy() != 2 && truthGenotype.getPloidy() > 0) ) { throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy())); } - perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); - doPrint = overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); + final boolean allelesMatch = doAllelesMatch(evalGenotype, truthGenotype, truth.getReference(), truthAlleles); + perSampleGenotypeConcordance.get(sample).update(allelesMatch, evalGenotype, truthGenotype); + final boolean doPrint = overallGenotypeConcordance.update(allelesMatch, evalGenotype, truthGenotype); if(sitesFile != null && doPrint) sitesFile.println(eval.getChr() + ":" + eval.getStart() + "\t" + sample + "\t" + truthGenotype.getType() + "\t" + evalGenotype.getType()); } @@ -211,6 +206,29 @@ public class ConcordanceMetrics { return total == 0l ? 0.0 : ( (double) confirmedVariant ) / ( (double) ( total ) ); } + private boolean doAllelesMatch(final Genotype eval, final Genotype truth, + final Allele truthRef, final Set truthSiteAlleles) { + // When determining if alleles match, there are a number of cases to consider. In order: + // 1) If either genotype is uncalled or unavailable, the alleles MATCH + // 2) If the truth genotype is hom ref, then: + // a) If the truth variant is mononallelic (no alternate alleles), the alleles MATCH + // b) Otherwise, the alleles match IFF the alleles in the eval genotype are a subset + // of the alleles in the truth VARIANT + // 3) Otherwise, the alleles match IFF the alleles in the eval genotype are a subset + // of the alleles in (the truth GENOTYPE + the truth REF allele) + boolean matching = true; + if (eval.isCalled() && truth.isCalled()) { // Case 1 + if (truth.isHomRef()) { // Case 2 + matching = truthSiteAlleles.size() == 1 || truthSiteAlleles.containsAll(eval.getAlleles()); + } else { // Case 3 + final Set truthAlleles = new HashSet<>(truth.getAlleles()); + truthAlleles.add(truthRef); + matching = truthAlleles.containsAll(eval.getAlleles()); + } + } + return matching; + } + class GenotypeConcordanceTable { @@ -223,38 +241,10 @@ public class ConcordanceMetrics { } @Requires({"eval!=null","truth != null","truthAlleles != null"}) - public Boolean update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { - // this is slow but correct. - - // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles - // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will - // all match, so long as A and T are alleles in the truth callset. - boolean matchingAlt = true; - int evalGT, truthGT; - if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) { - // by default, no-calls "match" between alleles, so if - // one or both sites are no-call or unavailable, the alt alleles match - // otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth - // the alt allele is mismatching - regardless of whether the genotype is correct. - for ( Allele evalAllele : eval.getAlleles() ) { - matchingAlt &= truthAlleles.contains(evalAllele.getBaseString()); - } - } else if ( eval.isCalled() && truth.isCalled() ) { - // otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele - // todo -- this can be sped up by caching the truth allele sets - Set genoAlleles = new HashSet(3); - genoAlleles.add(truthRef); - for ( Allele truthGenoAl : truth.getAlleles() ) { - genoAlleles.add(truthGenoAl.getBaseString()); - } - for ( Allele evalAllele : eval.getAlleles() ) { - matchingAlt &= genoAlleles.contains(evalAllele.getBaseString()); - } - } - + public Boolean update(final boolean matchingAlt, final Genotype eval, final Genotype truth) { if ( matchingAlt ) { - evalGT = eval.getType().ordinal(); - truthGT = truth.getType().ordinal(); + final int evalGT = eval.getType().ordinal(); + final int truthGT = truth.getType().ordinal(); genotypeCounts[evalGT][truthGT]++; if(evalGT != truthGT) //report variants where genotypes don't match return true; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java index e7a8cd23b..d87c7ec71 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java @@ -74,8 +74,8 @@ import java.util.*; *

Tables

*

* Headers for the (non-moltenized -- see below) GenotypeConcordance counts and proportions tables give the genotype of - * the COMP callset followed by the genotype of the EVAL callset. For example the value corresponding to HOM_REF_HET - * reflects variants called HOM_REF in the COMP callset and HET in the EVAL callset. Variants for which the alternate + * the EVAL callset followed by the genotype of the COMP callset. For example the value corresponding to HOM_REF_HET + * reflects variants called HOM_REF in the EVAL callset and HET in the COMP callset. Variants for which the alternate * alleles between the EVAL and COMP sample did not match are excluded from genotype comparisons and given in the * "Mismatching_Alleles" field. *