diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java index a85198ac8..1f3712a4a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; +import org.apache.commons.lang.ArrayUtils; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.utils.report.tags.Analysis; @@ -61,6 +63,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva @DataPoint(description = "the variant quality score histograms for true positive and false positive calls") QualityScoreHistograms qualityScoreHistograms = null; + @DataPoint(description = "the concordance statistics summary by allele count") + ACSummaryStats alleleCountSummary = null; + + @DataPoint(description = "the concordance statistics by allele count") + ACStats alleleCountStats = null; + private static final int MAX_MISSED_VALIDATION_DATA = 100; private boolean discordantInteresting = false; @@ -73,12 +81,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva public long nFound = 0; public long nMissed = 0; } - public HashMap alleleCountStats = new HashMap(); + public HashMap foundMissedByAC = new HashMap(); public Object[] getRowKeys() { - String rows[] = new String[alleleCountStats.size()]; + String rows[] = new String[foundMissedByAC.size()]; int index = 0; - for (int i : alleleCountStats.keySet()) rows[index++] = "Allele Count " + i; + for (int i : foundMissedByAC.keySet()) rows[index++] = "Allele Count " + i; return rows; } @@ -91,23 +99,23 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } public String getCell(int x, int y) { - if (x >= alleleCountStats.size()) throw new IllegalStateException(x + " is greater than the max index of " + (alleleCountStats.size()-1)); - if (y == 0) return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nFound); - else return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nMissed); + if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1)); + if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound); + else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed); } public void incrementFoundCount(int alleleFreq) { - if (!alleleCountStats.containsKey(alleleFreq)) - alleleCountStats.put(alleleFreq,new Stats(1,0)); + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(1,0)); else - alleleCountStats.get(alleleFreq).nFound++; + foundMissedByAC.get(alleleFreq).nFound++; } public void incrementMissedCount(int alleleFreq) { - if (!alleleCountStats.containsKey(alleleFreq)) - alleleCountStats.put(alleleFreq,new Stats(0,1)); + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(0,1)); else - alleleCountStats.get(alleleFreq).nMissed++; + foundMissedByAC.get(alleleFreq).nMissed++; } } @@ -256,7 +264,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva if (eval != null) { // initialize the concordance table sampleStats = new SampleStats(eval,Genotype.Type.values().length); + alleleCountStats = new ACStats(eval,Genotype.Type.values().length); sampleSummaryStats = new SampleSummaryStats(eval); + alleleCountSummary = new ACSummaryStats(eval); for (final VariantContext vc : missedValidationData) { determineStats(null, vc); } @@ -287,6 +297,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva String interesting = null; final boolean validationIsValidVC = isValidVC(validation); + final String evalAC = (eval != null && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ; + final String validationAC = ( validationIsValidVC && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null; // determine concordance for eval data if (eval != null) { @@ -306,6 +318,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } sampleStats.incrValue(sample, truth, called); + if ( evalAC != null ) { + alleleCountStats.incrValue(evalAC,truth,called); + } + if ( validationAC != null ) { + alleleCountStats.incrValue(validationAC,truth,called); + } } } // otherwise, mark no-calls for all samples @@ -315,7 +333,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva for (final String sample : validation.getSampleNames()) { final Genotype.Type truth = validation.getGenotype(sample).getType(); sampleStats.incrValue(sample, truth, called); - + if ( evalAC != null ) { + alleleCountStats.incrValue(evalAC,truth,called); + } // print out interesting sites if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { @@ -328,7 +348,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva } } - // determine allele count concordance () + // determine allele count concordance () // this is really a FN rate estimate -- CH if (validationIsValidVC && validation.isPolymorphic()) { int trueAlleleCount = 0; for (final Allele a : validation.getAlternateAlleles()) { @@ -371,6 +391,10 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva if( sampleSummaryStats != null && sampleStats != null ) { sampleSummaryStats.generateSampleSummaryStats( sampleStats ); } + + if ( alleleCountSummary != null && alleleCountStats != null ) { + alleleCountSummary.generateSampleSummaryStats( alleleCountStats ); + } } } @@ -417,12 +441,17 @@ class SampleStats implements TableType { "n_hom/ref","n_hom/het","n_hom/hom"}; } + public SampleStats(VariantContext vc, int nGenotypeTypes) { this.nGenotypeTypes = nGenotypeTypes; for (String sample : vc.getSampleNames()) concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); } + public SampleStats(int genotypeTypes) { + nGenotypeTypes = genotypeTypes; + } + public Object getCell(int x, int y) { // we have three rows of 6 right now for output (rows: ref, het, hom) Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type @@ -449,12 +478,35 @@ class SampleStats implements TableType { } } +/** + * Sample stats, but for AC + */ +class ACStats extends SampleStats { + public ACStats(VariantContext vc, int nGenotypeTypes) { + super(nGenotypeTypes); + for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here... + concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); + concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); + } + } + + public String getName() { + return "Allele Count Statistics"; + } + + public Object[] getRowKeys() { + String[] acNames = (String[]) super.getRowKeys(); + Arrays.sort(acNames,new CompACNames()); + return acNames; + } +} + /** * a table of sample names to genotype concordance summary statistics */ class SampleSummaryStats implements TableType { - private final static String ALL_SAMPLES_KEY = "allSamples"; - private final static String[] COLUMN_KEYS = new String[]{ + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS = new String[]{ "percent_comp_ref_called_var", "percent_comp_het_called_het", "percent_comp_het_called_var", @@ -465,7 +517,7 @@ class SampleSummaryStats implements TableType { "percent_non-reference_discrepancy_rate"}; // sample to concordance stats object - private final HashMap concordanceSummary = new HashMap(); + protected final HashMap concordanceSummary = new HashMap(); /** * @@ -490,6 +542,10 @@ class SampleSummaryStats implements TableType { } } + public SampleSummaryStats() { + + } + public Object getCell(int x, int y) { final Object[] rowKeys = getRowKeys(); return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]); @@ -615,3 +671,59 @@ class SampleSummaryStats implements TableType { return "Sample Summary Statistics"; } } + +/** + * SampleSummaryStats .. but for allele counts + */ +class ACSummaryStats extends SampleSummaryStats { + public ACSummaryStats (final VariantContext vc) { + concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); + for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) { + concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); + concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); + } + } + + public String getName() { + return "Allele Count Summary Statistics"; + } + + public Object[] getRowKeys() { + String[] acNames = (String[]) super.getRowKeys(); + Arrays.sort(acNames,new CompACNames()); + return acNames; + } +} + +class CompACNames implements Comparator{ + + public boolean equals(Object o) { + return ( o.getClass() == CompACNames.class ); + } + + public int compare(Object o1, Object o2) { + //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); + return getRank(o1) - getRank(o2); + } + + public int getRank(Object o) { + if ( o.getClass() != String.class ) { + return Integer.MIN_VALUE/4; + } else { + String s = (String) o; + if ( s.startsWith("eval") ) { + return Integer.MIN_VALUE/4 + 1 + parseAC(s); + } else if ( s.startsWith("comp") ) { + return 1+ parseAC(s); + } else { + return Integer.MIN_VALUE/4; + } + } + } + + public int parseAC(String s) { + String[] g = s.split("AC"); + return Integer.parseInt(g[1]); + } +} + diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 60a8298a4..2997040a6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -673,6 +673,9 @@ public class VariantEvalWalker extends RodWalker { if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) && SAMPLES_LIST.size() > 0 ) { //if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc)); vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values()); + HashMap newAts = new HashMap(vc.getAttributes()); + VariantContextUtils.calculateChromosomeCounts(vc,newAts,true); + vc = VariantContext.modifyAttributes(vc,newAts); //if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc)); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index eb39cd234..10c1f7c2b 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -29,7 +29,7 @@ public class String extraArgs = "-L 1:1-10,000,000"; for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s", - 1, Arrays.asList("482c868400f59e17dbc59d667b4b2eca")); + 1, Arrays.asList("158ac8e6d32eb2ea1bbeebfa512965de")); executeTest("testSelect1", spec); } } @@ -38,7 +38,7 @@ public class public void testSelect2() { String extraArgs = "-L 1:1-10,000,000"; WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s", - 1, Arrays.asList("4dfaa72b23ce297a2c29d9f7e9661c37")); + 1, Arrays.asList("cee96f61ffa1d042fe0c63550c508ec9")); executeTest("testSelect2", spec); } @@ -48,7 +48,7 @@ public class for (String vcfFile : vcfFiles) { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -B:eval,VCF " + validationDataLocation + vcfFile + " -B:comp,VCF " + validationDataLocation + "GenotypeConcordanceComp.vcf -noStandard -E GenotypeConcordance -reportType CSV -o %s", 1, - Arrays.asList("15d1075d384da2bb7445f7493f2b6a07")); + Arrays.asList("7e9ce1b26cdeaa50705f5de163847638")); executeTest("testVEGenotypeConcordance" + vcfFile, spec); } @@ -57,8 +57,8 @@ public class @Test public void testVESimple() { HashMap expectations = new HashMap(); - expectations.put("-L 1:1-10,000,000", "8891969e7522e728b64c112a2b2f9d1e"); - expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "ace2f6170e740a9ee6abc25f130c6848"); + expectations.put("-L 1:1-10,000,000", "ff8c4ba16c7c14b4edbaf440f20641f9"); + expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "9b509ce5d31658eb09bb9597799b2908"); for ( Map.Entry entry : expectations.entrySet() ) { String extraArgs = entry.getKey(); @@ -80,10 +80,10 @@ public class " -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; - String matchingMD5 = "dd513bc72860133a58e9ee542782162b"; + String matchingMD5 = "1e6d6e152c9a90513dd5b6eaa3729388"; expectations.put("", matchingMD5); expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5); - expectations.put(" -known comp_hapmap", "bef6d1e5fa3a79faf745711e0d8fa2dd"); + expectations.put(" -known comp_hapmap", "d28dd504017f39a91cde8e6f096879d6"); for (String tests : testsEnumerations) { for (Map.Entry entry : expectations.entrySet()) { String extraArgs2 = entry.getKey(); @@ -118,11 +118,18 @@ public class for (String tests : testsEnumerations) { WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s", 2, - Arrays.asList("77abdb58b3166d87daadf397e7fb51c4", "989bc30dea6c8a4cf771cd1b9fdab488")); + Arrays.asList("6b97a019402b3984fead9a4e8b7c7c2a", "989bc30dea6c8a4cf771cd1b9fdab488")); executeTest("testVEWriteVCF", spec); } } + @Test + public void testCompVsEvalAC() { + String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -E GenotypeConcordance -B:evalYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.fake.genotypes.ac.test.vcf -reportType CSV"; + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("25a681855cb26e7380fbf1a93de0a41f")); + executeTest("testACDiscordanceAtAC1EvalAC2Comp",spec); + } + private static String withSelect(String cmd, String select, String name) { return String.format("%s -select '%s' -selectName %s", cmd, select, name); }