This is simultaneously a minor and major change to VariantEval, so take heed:

The core walker has been modified so that when variant contexts (eval and comp) are subset to command-line-specified sample(s), the chromosome count annotations (AC/AN/AF) are altered to reflect the AC/AN/AF of only those samples involved in the comparison. No more getting AC500 when you're comparing a 10-sample overlap. Interestingly enough, this didn't break any integration tests.

GenotypeConcordance now has two additional tables: Allele Count Statistics, and Allele Count Summary Statistics. These work exactly identically to the Sample Statistics and Sample Summary Statistics tables, except that the partition being used is no longer the sample, but instead the allele count of the variant sites. These tables stratify by both eval and comp ACs, e.g.

evalAC0
evalAC1
evalAC2
compAC0
compAC1
compAC2

Differences with previous integration tests were verified to only be in the Allele Count tables (by grepping them out of the diff); a new test has been added for the simple case of an AC=1 site in the eval becoming an AC=2 site in the comp.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4491 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-10-13 22:26:15 +00:00
parent 5ee12875fb
commit 7c9ef59d65
3 changed files with 147 additions and 25 deletions

View File

@ -1,8 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.apache.commons.lang.ArrayUtils;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.report.tags.Analysis;
@ -61,6 +63,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
@DataPoint(description = "the variant quality score histograms for true positive and false positive calls")
QualityScoreHistograms qualityScoreHistograms = null;
@DataPoint(description = "the concordance statistics summary by allele count")
ACSummaryStats alleleCountSummary = null;
@DataPoint(description = "the concordance statistics by allele count")
ACStats alleleCountStats = null;
private static final int MAX_MISSED_VALIDATION_DATA = 100;
private boolean discordantInteresting = false;
@ -73,12 +81,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
public long nFound = 0;
public long nMissed = 0;
}
public HashMap<Integer, Stats> alleleCountStats = new HashMap<Integer, Stats>();
public HashMap<Integer, Stats> foundMissedByAC = new HashMap<Integer, Stats>();
public Object[] getRowKeys() {
String rows[] = new String[alleleCountStats.size()];
String rows[] = new String[foundMissedByAC.size()];
int index = 0;
for (int i : alleleCountStats.keySet()) rows[index++] = "Allele Count " + i;
for (int i : foundMissedByAC.keySet()) rows[index++] = "Allele Count " + i;
return rows;
}
@ -91,23 +99,23 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
}
public String getCell(int x, int y) {
if (x >= alleleCountStats.size()) throw new IllegalStateException(x + " is greater than the max index of " + (alleleCountStats.size()-1));
if (y == 0) return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nFound);
else return String.valueOf(alleleCountStats.get(alleleCountStats.keySet().toArray(new Integer[alleleCountStats.size()])[x]).nMissed);
if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1));
if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound);
else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed);
}
public void incrementFoundCount(int alleleFreq) {
if (!alleleCountStats.containsKey(alleleFreq))
alleleCountStats.put(alleleFreq,new Stats(1,0));
if (!foundMissedByAC.containsKey(alleleFreq))
foundMissedByAC.put(alleleFreq,new Stats(1,0));
else
alleleCountStats.get(alleleFreq).nFound++;
foundMissedByAC.get(alleleFreq).nFound++;
}
public void incrementMissedCount(int alleleFreq) {
if (!alleleCountStats.containsKey(alleleFreq))
alleleCountStats.put(alleleFreq,new Stats(0,1));
if (!foundMissedByAC.containsKey(alleleFreq))
foundMissedByAC.put(alleleFreq,new Stats(0,1));
else
alleleCountStats.get(alleleFreq).nMissed++;
foundMissedByAC.get(alleleFreq).nMissed++;
}
}
@ -256,7 +264,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
if (eval != null) {
// initialize the concordance table
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
alleleCountStats = new ACStats(eval,Genotype.Type.values().length);
sampleSummaryStats = new SampleSummaryStats(eval);
alleleCountSummary = new ACSummaryStats(eval);
for (final VariantContext vc : missedValidationData) {
determineStats(null, vc);
}
@ -287,6 +297,8 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
String interesting = null;
final boolean validationIsValidVC = isValidVC(validation);
final String evalAC = (eval != null && eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) ? String.format("evalAC%s",eval.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null ;
final String validationAC = ( validationIsValidVC && validation.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) ? String.format("compAC%s",validation.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY)) : null;
// determine concordance for eval data
if (eval != null) {
@ -306,6 +318,12 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
}
sampleStats.incrValue(sample, truth, called);
if ( evalAC != null ) {
alleleCountStats.incrValue(evalAC,truth,called);
}
if ( validationAC != null ) {
alleleCountStats.incrValue(validationAC,truth,called);
}
}
}
// otherwise, mark no-calls for all samples
@ -315,7 +333,9 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
for (final String sample : validation.getSampleNames()) {
final Genotype.Type truth = validation.getGenotype(sample).getType();
sampleStats.incrValue(sample, truth, called);
if ( evalAC != null ) {
alleleCountStats.incrValue(evalAC,truth,called);
}
// print out interesting sites
if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) {
if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) {
@ -328,7 +348,7 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
}
}
// determine allele count concordance ()
// determine allele count concordance () // this is really a FN rate estimate -- CH
if (validationIsValidVC && validation.isPolymorphic()) {
int trueAlleleCount = 0;
for (final Allele a : validation.getAlternateAlleles()) {
@ -371,6 +391,10 @@ public class GenotypeConcordance extends VariantEvaluator implements StandardEva
if( sampleSummaryStats != null && sampleStats != null ) {
sampleSummaryStats.generateSampleSummaryStats( sampleStats );
}
if ( alleleCountSummary != null && alleleCountStats != null ) {
alleleCountSummary.generateSampleSummaryStats( alleleCountStats );
}
}
}
@ -417,12 +441,17 @@ class SampleStats implements TableType {
"n_hom/ref","n_hom/het","n_hom/hom"};
}
public SampleStats(VariantContext vc, int nGenotypeTypes) {
this.nGenotypeTypes = nGenotypeTypes;
for (String sample : vc.getSampleNames())
concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]);
}
public SampleStats(int genotypeTypes) {
nGenotypeTypes = genotypeTypes;
}
public Object getCell(int x, int y) {
// we have three rows of 6 right now for output (rows: ref, het, hom)
Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type
@ -449,12 +478,35 @@ class SampleStats implements TableType {
}
}
/**
* Sample stats, but for AC
*/
class ACStats extends SampleStats {
public ACStats(VariantContext vc, int nGenotypeTypes) {
super(nGenotypeTypes);
for ( int i = 0; i <= 2*vc.getNSamples(); i++ ) { // todo -- assuming ploidy 2 here...
concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]);
concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]);
}
}
public String getName() {
return "Allele Count Statistics";
}
public Object[] getRowKeys() {
String[] acNames = (String[]) super.getRowKeys();
Arrays.sort(acNames,new CompACNames());
return acNames;
}
}
/**
* a table of sample names to genotype concordance summary statistics
*/
class SampleSummaryStats implements TableType {
private final static String ALL_SAMPLES_KEY = "allSamples";
private final static String[] COLUMN_KEYS = new String[]{
protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] COLUMN_KEYS = new String[]{
"percent_comp_ref_called_var",
"percent_comp_het_called_het",
"percent_comp_het_called_var",
@ -465,7 +517,7 @@ class SampleSummaryStats implements TableType {
"percent_non-reference_discrepancy_rate"};
// sample to concordance stats object
private final HashMap<String, double[]> concordanceSummary = new HashMap<String, double[]>();
protected final HashMap<String, double[]> concordanceSummary = new HashMap<String, double[]>();
/**
*
@ -490,6 +542,10 @@ class SampleSummaryStats implements TableType {
}
}
public SampleSummaryStats() {
}
public Object getCell(int x, int y) {
final Object[] rowKeys = getRowKeys();
return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]);
@ -615,3 +671,59 @@ class SampleSummaryStats implements TableType {
return "Sample Summary Statistics";
}
}
/**
* SampleSummaryStats .. but for allele counts
*/
class ACSummaryStats extends SampleSummaryStats {
public ACSummaryStats (final VariantContext vc) {
concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]);
for( int i = 0; i <= 2*vc.getNSamples() ; i ++ ) {
concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]);
concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]);
}
}
public String getName() {
return "Allele Count Summary Statistics";
}
public Object[] getRowKeys() {
String[] acNames = (String[]) super.getRowKeys();
Arrays.sort(acNames,new CompACNames());
return acNames;
}
}
class CompACNames implements Comparator{
public boolean equals(Object o) {
return ( o.getClass() == CompACNames.class );
}
public int compare(Object o1, Object o2) {
//System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2));
return getRank(o1) - getRank(o2);
}
public int getRank(Object o) {
if ( o.getClass() != String.class ) {
return Integer.MIN_VALUE/4;
} else {
String s = (String) o;
if ( s.startsWith("eval") ) {
return Integer.MIN_VALUE/4 + 1 + parseAC(s);
} else if ( s.startsWith("comp") ) {
return 1+ parseAC(s);
} else {
return Integer.MIN_VALUE/4;
}
}
}
public int parseAC(String s) {
String[] g = s.split("AC");
return Integer.parseInt(g[1]);
}
}

View File

@ -673,6 +673,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) && SAMPLES_LIST.size() > 0 ) {
//if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc));
vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values());
HashMap<String,Object> newAts = new HashMap<String,Object>(vc.getAttributes());
VariantContextUtils.calculateChromosomeCounts(vc,newAts,true);
vc = VariantContext.modifyAttributes(vc,newAts);
//if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc));
}

View File

@ -29,7 +29,7 @@ public class
String extraArgs = "-L 1:1-10,000,000";
for (String tests : testsEnumerations) {
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s",
1, Arrays.asList("482c868400f59e17dbc59d667b4b2eca"));
1, Arrays.asList("158ac8e6d32eb2ea1bbeebfa512965de"));
executeTest("testSelect1", spec);
}
}
@ -38,7 +38,7 @@ public class
public void testSelect2() {
String extraArgs = "-L 1:1-10,000,000";
WalkerTestSpec spec = new WalkerTestSpec( withSelect(withSelect(root, "DP < 50", "DP50"), "set==\"Intersection\"", "intersection") + " " + extraArgs + " -o %s",
1, Arrays.asList("4dfaa72b23ce297a2c29d9f7e9661c37"));
1, Arrays.asList("cee96f61ffa1d042fe0c63550c508ec9"));
executeTest("testSelect2", spec);
}
@ -48,7 +48,7 @@ public class
for (String vcfFile : vcfFiles) {
WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -B:eval,VCF " + validationDataLocation + vcfFile + " -B:comp,VCF " + validationDataLocation + "GenotypeConcordanceComp.vcf -noStandard -E GenotypeConcordance -reportType CSV -o %s",
1,
Arrays.asList("15d1075d384da2bb7445f7493f2b6a07"));
Arrays.asList("7e9ce1b26cdeaa50705f5de163847638"));
executeTest("testVEGenotypeConcordance" + vcfFile, spec);
}
@ -57,8 +57,8 @@ public class
@Test
public void testVESimple() {
HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "8891969e7522e728b64c112a2b2f9d1e");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "ace2f6170e740a9ee6abc25f130c6848");
expectations.put("-L 1:1-10,000,000", "ff8c4ba16c7c14b4edbaf440f20641f9");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0 -E MendelianViolationEvaluator", "9b509ce5d31658eb09bb9597799b2908");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
@ -80,10 +80,10 @@ public class
" -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String matchingMD5 = "dd513bc72860133a58e9ee542782162b";
String matchingMD5 = "1e6d6e152c9a90513dd5b6eaa3729388";
expectations.put("", matchingMD5);
expectations.put(" -known comp_hapmap -known dbsnp", matchingMD5);
expectations.put(" -known comp_hapmap", "bef6d1e5fa3a79faf745711e0d8fa2dd");
expectations.put(" -known comp_hapmap", "d28dd504017f39a91cde8e6f096879d6");
for (String tests : testsEnumerations) {
for (Map.Entry<String, String> entry : expectations.entrySet()) {
String extraArgs2 = entry.getKey();
@ -118,11 +118,18 @@ public class
for (String tests : testsEnumerations) {
WalkerTestSpec spec = new WalkerTestSpec(tests + " " + extraArgs + " -o %s -outputVCF %s",
2,
Arrays.asList("77abdb58b3166d87daadf397e7fb51c4", "989bc30dea6c8a4cf771cd1b9fdab488"));
Arrays.asList("6b97a019402b3984fead9a4e8b7c7c2a", "989bc30dea6c8a4cf771cd1b9fdab488"));
executeTest("testVEWriteVCF", spec);
}
}
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -E GenotypeConcordance -B:evalYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.fake.genotypes.ac.test.vcf -reportType CSV";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("25a681855cb26e7380fbf1a93de0a41f"));
executeTest("testACDiscordanceAtAC1EvalAC2Comp",spec);
}
private static String withSelect(String cmd, String select, String name) {
return String.format("%s -select '%s' -selectName %s", cmd, select, name);
}