Added genotype concordance module. Not at all finished, but needed to give something to Aaron to look at for help in printing the output nicely.

Also misc cleanup and fixes (e.g. perform evalulation even when no comp tracks are provided).



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2996 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-14 19:02:24 +00:00
parent ecb59f5d0d
commit e367a50e9b
6 changed files with 231 additions and 26 deletions

View File

@ -6,7 +6,6 @@ import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
@ -47,7 +46,7 @@ public class DbSNPPercentage extends VariantEvaluator {
/**
* What fraction of the evaluated site variants were also found in the db?
*
* @return
* @return db rate
*/
public double dbSNPRate() { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); }
public double concordanceRate() { return rate(nConcordant(), nSNPsAtdbSNPs()); }
@ -79,9 +78,9 @@ public class DbSNPPercentage extends VariantEvaluator {
/**
* Returns true if every allele in eval is also in dbsnp
*
* @param eval
* @param dbsnp
* @return
* @param eval eval context
* @param dbsnp db context
* @return true if eval and db are discordant
*/
public boolean discordantP(VariantContext eval, VariantContext dbsnp ) {
for ( Allele a : eval.getAlleles() ) {

View File

@ -0,0 +1,207 @@
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class GenotypeConcordance extends VariantEvaluator {
private static final int nGenotypeTypes = Genotype.Type.values().length;
class SampleStats {
long[][] concordance = new long[nGenotypeTypes][nGenotypeTypes];
public String toString() {
StringBuffer sb = new StringBuffer();
for (int truth = 0; truth < nGenotypeTypes; truth++ ) {
// don't print out when truth = no-call
if ( truth == Genotype.Type.NO_CALL.ordinal() )
continue;
long total = 0;
for (int called = 0; called < nGenotypeTypes; called++ )
total += concordance[truth][called];
sb.append(String.format("%d %d %.2f ", total, concordance[truth][truth], total == 0 ? 0.0 : (100.0 * (double)concordance[truth][truth] / (double)total)));
for (int called = 0; called < nGenotypeTypes; called++ ) {
if ( called != truth )
sb.append(String.format("%d ", concordance[truth][called]));
}
}
return sb.toString();
}
}
class FrequencyStats {
long nFound = 0;
long nMissed = 0;
public String toString() {
long total = nFound + nMissed;
return String.format("%d %d %.2f ", nFound, nMissed, total == 0 ? 0.0 : (100.0 * rate(nFound, total)));
}
}
// a mapping from allele count to stats
private HashMap<Integer, FrequencyStats> alleleCountStats = new HashMap<Integer, FrequencyStats>();
// a mapping from sample to stats
private HashMap<String, SampleStats> concordanceStats = null;
// keep a list of the validation data we saw before the first eval data
private HashSet<VariantContext> missedValidationData = new HashSet<VariantContext>();
public GenotypeConcordance(VariantEval2Walker parent) {
// don't do anything
}
public String getName() {
return "genotypeConcordance";
}
public int getComparisonOrder() {
return 2; // we need to see each eval track and each comp track
}
public boolean enabled() { return true; }
public String toString() {
return getName() + ": " + getTableRows();
}
private static List<String> SAMPLE_HEADER =
Arrays.asList("sample",
"total_true_ref", "n_ref/ref", "%_ref/ref",
"n_ref/no-call", "n_ref/het", "n_ref/hom",
"total_true_het", "n_het/het", "%_het/het",
"n_het/no-call", "n_het/ref", "n_het/hom",
"total_true_hom", "n_hom/hom", "%_hom/hom",
"n_hom/no-call", "n_hom/ref", "n_hom/het");
private static List<String> FREQUENCY_HEADER =
Arrays.asList("alleleCount", "n_found", "n_missed", "%_found");
// making it a table
public List<String> getTableHeader() {
ArrayList<String> header = new ArrayList<String>();
header.addAll(SAMPLE_HEADER);
header.addAll(FREQUENCY_HEADER);
return header;
}
public List<List<String>> getTableRows() {
ArrayList<List<String>> rows = new ArrayList<List<String>>();
if ( concordanceStats != null ) {
for ( Map.Entry<String, SampleStats> sample : concordanceStats.entrySet() )
rows.add(Arrays.asList(String.format("%s %s", sample.getKey(), sample.getValue().toString()).split(" ")));
}
if ( alleleCountStats != null ) {
for ( Map.Entry<Integer, FrequencyStats> alleleCount : alleleCountStats.entrySet() )
rows.add(Arrays.asList(String.format("%d %s", alleleCount.getKey(), alleleCount.getValue().toString()).split(" ")));
}
return rows;
}
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String interesting = null;
// sanity check that we at least have either eval or validation data
if ( eval == null && !isValidVC(validation) )
return interesting;
if ( concordanceStats == null ) {
if ( eval != null ) {
// initialize the concordance table
createConcordanceTable(eval);
for ( VariantContext vc : missedValidationData )
determineStats(null, vc);
missedValidationData = null;
} else {
missedValidationData.add(validation);
return interesting;
}
}
determineStats(eval, validation);
return interesting; // we don't capture any interesting sites
}
private void determineStats(VariantContext eval, VariantContext validation) {
// determine concordance for eval data
if ( eval != null ) {
for ( String sample : eval.getSampleNames() ) {
Genotype.Type called = eval.getGenotype(sample).getType();
Genotype.Type truth;
if ( !isValidVC(validation) || !validation.hasGenotype(sample) )
truth = Genotype.Type.NO_CALL;
else
truth = validation.getGenotype(sample).getType();
SampleStats stats = concordanceStats.get(sample);
if ( stats == null )
throw new StingException("Sample " + sample + " has not been seen in a previous eval; this analysis module assumes that all samples are present in each variant context");
stats.concordance[truth.ordinal()][called.ordinal()]++;
}
}
// otherwise, mark no-calls for all samples
else {
Genotype.Type called = Genotype.Type.NO_CALL;
for ( String sample : validation.getSampleNames() ) {
SampleStats stats = concordanceStats.get(sample);
if ( stats == null )
continue;
Genotype.Type truth = validation.getGenotype(sample).getType();
stats.concordance[truth.ordinal()][called.ordinal()]++;
}
}
// determine allele count concordance ()
if ( isValidVC(validation) && validation.isPolymorphic() ) {
int trueAlleleCount = 0;
for ( Allele a : validation.getAlternateAlleles() )
trueAlleleCount += validation.getChromosomeCount(a);
if ( !alleleCountStats.containsKey(trueAlleleCount) )
alleleCountStats.put(trueAlleleCount, new FrequencyStats());
FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
if ( eval != null )
stats.nFound++;
else
stats.nMissed++;
}
}
private static boolean isValidVC(VariantContext vc) {
return (vc != null && !vc.isFiltered());
}
private void createConcordanceTable(VariantContext vc) {
concordanceStats = new HashMap<String, SampleStats>();
for ( String sample : vc.getSampleNames() )
concordanceStats.put(sample, new SampleStats());
}
}

View File

@ -109,7 +109,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
if ( isViolation(vc, momG, dadG, childG) ) {
nViolations++;
String label = null;
String label;
if ( childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar() )) {
label = "KidHomRef_ParentHomVar";
KidHomRef_ParentHomVar++;

View File

@ -4,9 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
@ -84,7 +82,7 @@ public class ValidationRate extends VariantEvaluator {
String interesting = null;
if ( validation != null && validation.hasGenotypes() && validation.isNotFiltered() ) {
SiteStats overlap = null;
SiteStats overlap;
if ( validation.isPolymorphic() ) {
validationStats.nPoly++;

View File

@ -3,10 +3,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableVariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
@ -30,7 +27,6 @@ import java.util.*;
// todo -- write a simple column table system and have the evaluators return this instead of the list<list<string>> objects
// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers)
// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth)
// todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval). Needs to process subset of samples in true (pools)
// todo -- clustered SNP counter
// todo -- HWEs
@ -121,7 +117,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
@Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false)
protected String rsIDFile = null;
@Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of knwon snps", required=false)
@Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of known snps", required=false)
protected int maxRsIDBuild = Integer.MAX_VALUE;
Set<String> rsIDsToExclude = null;
@ -183,6 +179,8 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
private static String KNOWN_SET_NAME = "known";
private static String NOVEL_SET_NAME = "novel";
private static String NO_COMP_NAME = "N/A";
private final static String CONTEXT_HEADER = "eval.comp.select.filter.novelty";
private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split("\\.").length;
private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS];
@ -221,6 +219,10 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
}
}
// if no comp rod was provided, we still want to be able to do evaluations, so use a default comp name
if ( compNames.size() == 0 )
compNames.add(NO_COMP_NAME);
contexts = initializeEvaluationContexts(evalNames, compNames, selectExps);
determineContextNamePartSizes();

View File

@ -6,14 +6,15 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jan 29, 2010
* Time: 3:38:02 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
abstract class VariantEvaluator {
// protected boolean accumulateInterestingSites = false, printInterestingSites = false;
@ -40,13 +41,11 @@ abstract class VariantEvaluator {
//public boolean processedAnySites() { return processedASite; }
//protected void markSiteAsProcessed() { processedASite = true; }
/** Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 */
// Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2
public abstract int getComparisonOrder();
/** called at all sites, regardless of eval context itself; useful for counting processed bases */
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
}
// called at all sites, regardless of eval context itself; useful for counting processed bases
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return null;