From 7c9ef59d6594eb53f210ccaca9f19b1505d35cb0 Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 13 Oct 2010 22:26:15 +0000 Subject: [PATCH] This is simultaneously a minor and major change to VariantEval, so take heed: The core walker has been modified so that when variant contexts (eval and comp) are subset to command-line-specified sample(s), the chromosome count annotations (AC/AN/AF) are altered to reflect the AC/AN/AF of only those samples involved in the comparison. No more getting AC500 when you're comparing a 10-sample overlap. Interestingly enough, this didn't break any integration tests. GenotypeConcordance now has two additional tables: Allele Count Statistics, and Allele Count Summary Statistics. These work exactly identically to the Sample Statistics and Sample Summary Statistics tables, except that the partition being used is no longer the sample, but instead the allele count of the variant sites. These tables stratify by both eval and comp ACs, e.g. evalAC0 evalAC1 evalAC2 compAC0 compAC1 compAC2 Differences with previous integration tests were verified to only be in the Allele Count tables (by grepping them out of the diff); a new test has been added for the simple case of an AC=1 site in the eval becoming an AC=2 site in the comp. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4491 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/GenotypeConcordance.java | 146 ++++++++++++++++-- .../varianteval/VariantEvalWalker.java | 3 + .../VariantEvalIntegrationTest.java | 23 ++- 3 files changed, 147 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java index a85198ac8..1f3712a4a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; +import org.apache.commons.lang.ArrayUtils; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.utils.report.tags.Analysis; @@ -61,6 +63,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva @DataPoint(description = "the variant quality score histograms for true positive and false positive calls") QualityScoreHistograms qualityScoreHistograms = null; + @DataPoint(description = "the concordance statistics summary by allele count") + ACSummaryStats alleleCountSummary = null; + + @DataPoint(description = "the concordance statistics by allele count") + ACStats alleleCountStats = null; + private static final int MAX_MISSED_VALIDATION_DATA = 100; private boolean discordantInteresting = false; @@ -73,12 +81,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva public long nFound = 0; public long nMissed = 0; } - public HashMap alleleCountStats = new HashMap(); + public HashMap foundMissedByAC = new HashMap(); public Object[] getRowKeys() { - String rows[] = new String[alleleCountStats.size()]; + String rows[] = new String[foundMissedByAC.size()]; int index = 0; - for (int i : alleleCountStats.keySet()) rows[index++] = "Allele Count " + i; + for (int i : foundMissedByAC.keySet()) rows[index++] = "Allele Count " + i; return rows; } @@ -91,23 +99,23 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } public String getCell(int x, int y) { - if (x >= alleleCountStats.size()) throw new IllegalStateException(x + " is greater than the max index of " + (alleleCountStats.size()-1)); - if (y == 0) return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nFound); - else return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nMissed); + if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1)); + if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound); + else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed); } public void incrementFoundCount(int alleleFreq) { - if (!alleleCountStats.containsKey(alleleFreq)) - alleleCountStats.put(alleleFreq,new Stats(1,0)); + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(1,0)); else - alleleCountStats.get(alleleFreq).nFound++; + foundMissedByAC.get(alleleFreq).nFound++; } public void incrementMissedCount(int alleleFreq) { - if (!alleleCountStats.containsKey(alleleFreq)) - alleleCountStats.put(alleleFreq,new Stats(0,1)); + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(0,1)); else - alleleCountStats.get(alleleFreq).nMissed++; + foundMissedByAC.get(alleleFreq).nMissed++; } } @@ -256,7 +264,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva if (eval != null) { // initialize the concordance table sampleStats = new SampleStats(eval,Genotype.Type.values().length); + alleleCountStats = new ACStats(eval,Genotype.Type.values().length); sampleSummaryStats = new SampleSummaryStats(eval); + alleleCountSummary = new ACSummaryStats(eval); for (final VariantContext vc : missedValidationData) { determineStats(null, vc); } @@ -287,6 +297,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva String interesting = null; final boolean validationIsValidVC = isValidVC(validation); + final String evalAC = (eval != null && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ; + final String validationAC = ( validationIsValidVC && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null; // determine concordance for eval data if (eval != null) { @@ -306,6 +318,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } sampleStats.incrValue(sample, truth, called); + if ( evalAC != null ) { + alleleCountStats.incrValue(evalAC,truth,called); + } + if ( validationAC != null ) { + alleleCountStats.incrValue(validationAC,truth,called); + } } } // otherwise, mark no-calls for all samples @@ -315,7 +333,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva for (final String sample : validation.getSampleNames()) { final Genotype.Type truth = validation.getGenotype(sample).getType(); sampleStats.incrValue(sample, truth, called); - + if ( evalAC != null ) { + alleleCountStats.incrValue(evalAC,truth,called); + } // print out interesting sites if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { @@ -328,7 +348,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } } - // determine allele count concordance () + // determine allele count concordance () // this is really a FN rate estimate -- CH if (validationIsValidVC && validation.isPolymorphic()) { int trueAlleleCount = 0; for (final Allele a : validation.getAlternateAlleles()) { @@ -371,6 +391,10 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva if( sampleSummaryStats != null && sampleStats != null ) { sampleSummaryStats.generateSampleSummaryStats( sampleStats ); } + + if ( alleleCountSummary != null && alleleCountStats != null ) { + alleleCountSummary.generateSampleSummaryStats( alleleCountStats ); + } } } @@ -417,12 +441,17 @@ class SampleStats implements TableType { "n_hom/ref","n_hom/het","n_hom/hom"}; } + public SampleStats(VariantContext vc, int nGenotypeTypes) { this.nGenotypeTypes = nGenotypeTypes; for (String sample : vc.getSampleNames()) concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); } + public SampleStats(int genotypeTypes) { + nGenotypeTypes = genotypeTypes; + } + public Object getCell(int x, int y) { // we have three rows of 6 right now for output (rows: ref, het, hom) Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type @@ -449,12 +478,35 @@ class SampleStats implements TableType { } } +/** + * Sample stats, but for AC + */ +class ACStats extends SampleStats { + public ACStats(VariantContext vc, int nGenotypeTypes) { + super(nGenotypeTypes); + for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here... + concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); + concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); + } + } + + public String getName() { + return "Allele Count Statistics"; + } + + public Object[] getRowKeys() { + String[] acNames = (String[]) super.getRowKeys(); + Arrays.sort(acNames,new CompACNames()); + return acNames; + } +} + /** * a table of sample names to genotype concordance summary statistics */ class SampleSummaryStats implements TableType { - private final static String ALL_SAMPLES_KEY = "allSamples"; - private final static String[] COLUMN_KEYS = new String[]{ + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS = new String[]{ "percent_comp_ref_called_var", "percent_comp_het_called_het", "percent_comp_het_called_var", @@ -465,7 +517,7 @@ class SampleSummaryStats implements TableType { "percent_non-reference_discrepancy_rate"}; // sample to concordance stats object - private final HashMap concordanceSummary = new HashMap(); + protected final HashMap concordanceSummary = new HashMap(); /** * @@ -490,6 +542,10 @@ class SampleSummaryStats implements TableType { } } + public SampleSummaryStats() { + + } + public Object getCell(int x, int y) { final Object[] rowKeys = getRowKeys(); return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]); @@ -615,3 +671,59 @@ class SampleSummaryStats implements TableType { return "Sample Summary Statistics"; } } + +/** + * SampleSummaryStats .. but for allele counts + */ +class ACSummaryStats extends SampleSummaryStats { + public ACSummaryStats (final VariantContext vc) { + concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); + for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) { + concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); + concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); + } + } + + public String getName() { + return "Allele Count Summary Statistics"; + } + + public Object[] getRowKeys() { + String[] acNames = (String[]) super.getRowKeys(); + Arrays.sort(acNames,new CompACNames()); + return acNames; + } +} + +class CompACNames implements Comparator{ + + public boolean equals(Object o) { + return ( o.getClass() == CompACNames.class ); + } + + public int compare(Object o1, Object o2) { + //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); + return getRank(o1) - getRank(o2); + } + + public int getRank(Object o) { + if ( o.getClass() != String.class ) { + return Integer.MIN_VALUE/4; + } else { + String s = (String) o; + if ( s.startsWith("eval") ) { + return Integer.MIN_VALUE/4 + 1 + parseAC(s); + } else if ( s.startsWith("comp") ) { + return 1+ parseAC(s); + } else { + return Integer.MIN_VALUE/4; + } + } + } + + public int parseAC(String s) { + String[] g = s.split("AC"); + return Integer.parseInt(g[1]); + } +} + diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 60a8298a4..2997040a6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -673,6 +673,9 @@ public class VariantEvalWalker extends RodWalker { if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) && SAMPLES_LIST.size() > 0 ) { //if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc)); vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values()); + HashMap newAts = new HashMap(vc.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(vc,newAts,true); + vc = VariantContext.modifyAttributes(vc,newAts); //if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc)); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index eb39cd234..10c1f7c2b 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -29,7 +29,7 @@ public class String extraArgs = "-L 1:1-10,000,000"; for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s", - 1, Arrays.asList("482c868400f59e17dbc59d667b4b2eca")); + 1, Arrays.asList("158ac8e6d32eb2ea1bbeebfa512965de")); executeTest("testSelect1", spec); } } @@ -38,7 +38,7 @@ public class public void testSelect2() { String extraArgs = "-L 1:1-10,000,000"; WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s", - 1, Arrays.asList("4dfaa72b23ce297a2c29d9f7e9661c37")); + 1, Arrays.asList("cee96f61ffa1d042fe0c63550c508ec9")); executeTest("testSelect2", spec); } @@ -48,7 +48,7 @@ public class for (String vcfFile : vcfFiles) { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -B:eval,VCF " + validationDataLocation + vcfFile + " -B:comp,VCF " + validationDataLocation + "GenotypeConcordanceComp.vcf -noStandard -E GenotypeConcordance -reportType CSV -o %s", 1, - Arrays.asList("15d1075d384da2bb7445f7493f2b6a07")); + Arrays.asList("7e9ce1b26cdeaa50705f5de163847638")); executeTest("testVEGenotypeConcordance" + vcfFile, spec); } @@ -57,8 +57,8 @@ public class @Test public void testVESimple() { HashMap expectations = new HashMap(); - expectations.put("-L 1:1-10,000,000", "8891969e7522e728b64c112a2b2f9d1e"); - expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "ace2f6170e740a9ee6abc25f130c6848"); + expectations.put("-L 1:1-10,000,000", "ff8c4ba16c7c14b4edbaf440f20641f9"); + expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "9b509ce5d31658eb09bb9597799b2908"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs = entry.getKey(); @@ -80,10 +80,10 @@ public class " -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; - String matchingMD5 = "dd513bc72860133a58e9ee542782162b"; + String matchingMD5 = "1e6d6e152c9a90513dd5b6eaa3729388"; expectations.put("", matchingMD5); expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5); - expectations.put(" -known comp_hapmap", "bef6d1e5fa3a79faf745711e0d8fa2dd"); + expectations.put(" -known comp_hapmap", "d28dd504017f39a91cde8e6f096879d6"); for (String tests : testsEnumerations) { for (Map.Entry entry : expectations.entrySet()) { String extraArgs2 = entry.getKey(); @@ -118,11 +118,18 @@ public class for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s", 2, - Arrays.asList("77abdb58b3166d87daadf397e7fb51c4", "989bc30dea6c8a4cf771cd1b9fdab488")); + Arrays.asList("6b97a019402b3984fead9a4e8b7c7c2a", "989bc30dea6c8a4cf771cd1b9fdab488")); executeTest("testVEWriteVCF", spec); } } + @Test + public void testCompVsEvalAC() { + String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -E GenotypeConcordance -B:evalYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.fake.genotypes.ac.test.vcf -reportType CSV"; + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("25a681855cb26e7380fbf1a93de0a41f")); + executeTest("testACDiscordanceAtAC1EvalAC2Comp",spec); + } + private static String withSelect(String cmd, String select, String name) { return String.format("%s -select '%s' -selectName %s", cmd, select, name); }