From 6ff74deac71fb405638c85612ea762acbaee4e39 Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Mon, 6 May 2013 12:11:04 -0400 Subject: [PATCH] Add overall genotype concordance to the genotype concordance tool. In addition, protect from non-diploid genotypes, which can cause very strange behavior. Update MD5 sums. As expected, md5 changes are consistent with the genotype concordance field being added to each output. --- .../ConcordanceMetricsUnitTest.java | 2 + .../GenotypeConcordanceIntegrationTest.java | 14 +++---- .../variantutils/ConcordanceMetrics.java | 41 +++++++++++++++++++ .../variantutils/GenotypeConcordance.java | 10 +++++ 4 files changed, 60 insertions(+), 7 deletions(-) mode change 100644 => 100755 protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java mode change 100644 => 100755 protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java mode change 100644 => 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java mode change 100644 => 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java old mode 100644 new mode 100755 index bca912d63..bd9ff4f80 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -567,8 +567,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { table[5] = new int[] {12, 0, 34, 20, 10, 0}; double EXPEC_NRS = 0.8969957; double EXPEC_NRD = 0.1071429; + double EXPEC_OGC = 0.92592592; // (100+150+50)/(100+5+1+150+7+3+50+2+6) Assert.assertEquals(EXPEC_NRS,metrics.getOverallNRS(),1e-7); Assert.assertEquals(EXPEC_NRD,metrics.getOverallNRD(),1e-7); + Assert.assertEquals(EXPEC_OGC,metrics.getOverallOGC(),1e-7); int EXPEC_EVAL_REF = 124; int EXPEC_EVAL_HET = 169; int EXPEC_EVAL_VAR = 62; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java old mode 100644 new mode 100755 index d5f75a8cd..ffd358a6e --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java @@ -64,7 +64,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("NA12878.Jan2013.haplotypeCaller.subset.indels.vcf", "NA12878.Jan2013.bestPractices.subset.indels.vcf"), 0, - Arrays.asList("0f29a0c6dc44066228c8cb204fd53ec0") + Arrays.asList("6fe03c63a76cb61a76e550137ebf8c5e") ); executeTest("test indel concordance", spec); @@ -75,7 +75,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("GenotypeConcordanceNonOverlapTest_Eval.vcf", "GenotypeConcordanceNonOverlapTest_Comp.vcf"), 0, - Arrays.asList("fc725022d47b4b5f8a6ef87f0f1ffe89") + Arrays.asList("6246d81b25a9a96e379c47056177a65d") ); executeTest("test non-overlapping samples", spec); @@ -86,7 +86,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("GenotypeConcordanceNonOverlapTest_Eval.vcf", "GenotypeConcordanceNonOverlapTest_Comp.vcf") + " -moltenize", 0, - Arrays.asList("3993709e38b033e89017dfbb63226e94") + Arrays.asList("ee1da9b0119ce7869b2d05d81cef255e") ); executeTest("Test moltenized output",spec); @@ -97,7 +97,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("GenotypeConcordance.multipleRecordsTest1.eval.vcf","GenotypeConcordance.multipleRecordsTest1.comp.vcf"), 0, - Arrays.asList("352d59c4ac0cee5eb8ddbc9404b19ce9") + Arrays.asList("a1c48b041b0f0b8bf9387d5db337e5a1") ); executeTest("test multiple records per site",spec); @@ -108,7 +108,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("genotypeConcordanceFilterTest.vcf","genotypeConcordanceFilterTest.vcf") + " -gfe 'GQ<30'", 0, - Arrays.asList("b7b495ccfa6d50a6be3e095d3f6d3c52") + Arrays.asList("7f52e70482c30031bedf2fcc6bd359b2") ); executeTest("Test filtering on the EVAL rod",spec); @@ -119,7 +119,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("genotypeConcordanceFilterTest.vcf","genotypeConcordanceFilterTest.vcf") + " -gfc 'LX<0.50'", 0, - Arrays.asList("6406b16cde7960b8943edf594303afd6") + Arrays.asList("1402712d1ab18bafa5bac130af2f974c") ); executeTest("Test filtering on the COMP rod", spec); @@ -130,7 +130,7 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("genotypeConcordanceFilterTest.vcf","genotypeConcordanceFilterTest.vcf") + " -gfc 'LX<0.52' -gfe 'DP<5' -gfe 'GQ<37'", 0, - Arrays.asList("26ffd06215b6177acce0ea9f35d73d31") + Arrays.asList("6b83695122481d2dcbe3c792caf743a1") ); executeTest("Test filtering on both rods",spec); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java old mode 100644 new mode 100755 index b3b4857b6..005acf27b --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFHeader; @@ -81,10 +82,23 @@ public class ConcordanceMetrics { return Collections.unmodifiableMap(nrd); } + public Map getPerSampleOGC() { + Map ogc = new HashMap(perSampleGenotypeConcordance.size()); + for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { + ogc.put(sampleTable.getKey(),calculateOGC(sampleTable.getValue())); + } + + return Collections.unmodifiableMap(ogc); + } + public Double getOverallNRD() { return calculateNRD(overallGenotypeConcordance); } + public Double getOverallOGC() { + return calculateOGC(overallGenotypeConcordance); + } + public Map getPerSampleNRS() { Map nrs = new HashMap(perSampleGenotypeConcordance.size()); for ( Map.Entry sampleTable : perSampleGenotypeConcordance.entrySet() ) { @@ -110,6 +124,11 @@ public class ConcordanceMetrics { for ( String sample : perSampleGenotypeConcordance.keySet() ) { Genotype evalGenotype = eval.getGenotype(sample); Genotype truthGenotype = truth.getGenotype(sample); + // ensure genotypes are either no-call ("."), missing (empty alleles), or diploid + if ( ( ! evalGenotype.isNoCall() && evalGenotype.getPloidy() != 2 && evalGenotype.getPloidy() > 0) || + ( ! truthGenotype.isNoCall() && truthGenotype.getPloidy() != 2 && truthGenotype.getPloidy() > 0) ) { + throw new UserException(String.format("Concordance Metrics is currently only implemented for DIPLOID genotypes, found eval ploidy: %d, comp ploidy: %d",evalGenotype.getPloidy(),truthGenotype.getPloidy())); + } perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); } @@ -136,10 +155,32 @@ public class ConcordanceMetrics { return total == 0 ? 1.0 : 1.0 - ( (double) correct)/( (double) total); } + private static double calculateOGC(int[][] concordanceCounts) { + int correct = 0; + int total = 0; + correct += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_REF.ordinal()]; + correct += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HET.ordinal()]; + correct += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += correct; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HET.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_REF.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HET.ordinal()][GenotypeType.HOM_VAR.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HOM_REF.ordinal()]; + total += concordanceCounts[GenotypeType.HOM_VAR.ordinal()][GenotypeType.HET.ordinal()]; + // NRD is by definition incorrec/total = 1.0-correct/total + // note: if there are no observations (so the ratio is NaN), set this to 100% + return total == 0 ? 1.0 : ( (double) correct)/( (double) total); + } + private static double calculateNRS(GenotypeConcordanceTable table) { return calculateNRS(table.getTable()); } + private static double calculateOGC(GenotypeConcordanceTable table) { + return calculateOGC(table.getTable()); + } + private static double calculateNRS(int[][] concordanceCounts) { long confirmedVariant = 0; long unconfirmedVariant = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java old mode 100644 new mode 100755 index 3110d25b9..dd9e822c8 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -293,6 +293,7 @@ public class GenotypeConcordance extends RodWalker entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) { ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue(); @@ -378,9 +379,13 @@ public class GenotypeConcordance extends RodWalker nrdEntry : metrics.getPerSampleNRD().entrySet() ) { concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue()); } + for ( Map.Entry nrdEntry : metrics.getPerSampleOGC().entrySet() ) { + concordanceSummary.set(nrdEntry.getKey(),"Overall_Genotype_Concordance",nrdEntry.getValue()); + } concordanceSummary.set("ALL_NRS_NRD","Sample","ALL"); concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS()); concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC()); for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { @@ -411,6 +416,7 @@ public class GenotypeConcordance extends RodWalker nrdEntry : metrics.getPerSampleNRD().entrySet() ) { concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue()); } + for ( Map.Entry ogcEntry : metrics.getPerSampleOGC().entrySet() ) { + concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue()); + } concordanceSummary.set("ALL","Sample","ALL"); concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS()); concordanceSummary.set("ALL","Non-Reference Discrepancy",metrics.getOverallNRD()); + concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC()); for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) { siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type));