diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java index 7a3630b35..79a0a12e3 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java @@ -6,7 +6,6 @@ import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; -import java.util.ArrayList; import java.util.List; import java.util.Arrays; @@ -47,7 +46,7 @@ public class DbSNPPercentage extends VariantEvaluator { /** * What fraction of the evaluated site variants were also found in the db? * - * @return + * @return db rate */ public double dbSNPRate() { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); } public double concordanceRate() { return rate(nConcordant(), nSNPsAtdbSNPs()); } @@ -79,9 +78,9 @@ public class DbSNPPercentage extends VariantEvaluator { /** * Returns true if every allele in eval is also in dbsnp * - * @param eval - * @param dbsnp - * @return + * @param eval eval context + * @param dbsnp db context + * @return true if eval and db are discordant */ public boolean discordantP(VariantContext eval, VariantContext dbsnp ) { for ( Allele a : eval.getAlleles() ) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java new file mode 100755 index 000000000..702a23452 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java @@ -0,0 +1,207 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2; + +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.contexts.variantcontext.*; +import org.broadinstitute.sting.utils.StingException; + +import java.util.*; + +/** + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class GenotypeConcordance extends VariantEvaluator { + + private static final int nGenotypeTypes = Genotype.Type.values().length; + + class SampleStats { + + long[][] concordance = new long[nGenotypeTypes][nGenotypeTypes]; + + public String toString() { + StringBuffer sb = new StringBuffer(); + for (int truth = 0; truth < nGenotypeTypes; truth++ ) { + // don't print out when truth = no-call + if ( truth == Genotype.Type.NO_CALL.ordinal() ) + continue; + long total = 0; + for (int called = 0; called < nGenotypeTypes; called++ ) + total += concordance[truth][called]; + sb.append(String.format("%d %d %.2f ", total, concordance[truth][truth], total == 0 ? 0.0 : (100.0 * (double)concordance[truth][truth] / (double)total))); + for (int called = 0; called < nGenotypeTypes; called++ ) { + if ( called != truth ) + sb.append(String.format("%d ", concordance[truth][called])); + } + } + + return sb.toString(); + } + } + + class FrequencyStats { + long nFound = 0; + long nMissed = 0; + + public String toString() { + long total = nFound + nMissed; + return String.format("%d %d %.2f ", nFound, nMissed, total == 0 ? 0.0 : (100.0 * rate(nFound, total))); + } + } + + // a mapping from allele count to stats + private HashMap alleleCountStats = new HashMap(); + + // a mapping from sample to stats + private HashMap concordanceStats = null; + + // keep a list of the validation data we saw before the first eval data + private HashSet missedValidationData = new HashSet(); + + + public GenotypeConcordance(VariantEval2Walker parent) { + // don't do anything + } + + public String getName() { + return "genotypeConcordance"; + } + + public int getComparisonOrder() { + return 2; // we need to see each eval track and each comp track + } + + public boolean enabled() { return true; } + + public String toString() { + return getName() + ": " + getTableRows(); + } + + private static List SAMPLE_HEADER = + Arrays.asList("sample", + "total_true_ref", "n_ref/ref", "%_ref/ref", + "n_ref/no-call", "n_ref/het", "n_ref/hom", + "total_true_het", "n_het/het", "%_het/het", + "n_het/no-call", "n_het/ref", "n_het/hom", + "total_true_hom", "n_hom/hom", "%_hom/hom", + "n_hom/no-call", "n_hom/ref", "n_hom/het"); + + private static List FREQUENCY_HEADER = + Arrays.asList("alleleCount", "n_found", "n_missed", "%_found"); + + // making it a table + public List getTableHeader() { + ArrayList header = new ArrayList(); + header.addAll(SAMPLE_HEADER); + header.addAll(FREQUENCY_HEADER); + return header; + } + + public List> getTableRows() { + ArrayList> rows = new ArrayList>(); + + if ( concordanceStats != null ) { + for ( Map.Entry sample : concordanceStats.entrySet() ) + rows.add(Arrays.asList(String.format("%s %s", sample.getKey(), sample.getValue().toString()).split(" "))); + } + + if ( alleleCountStats != null ) { + for ( Map.Entry alleleCount : alleleCountStats.entrySet() ) + rows.add(Arrays.asList(String.format("%d %s", alleleCount.getKey(), alleleCount.getValue().toString()).split(" "))); + } + + return rows; + } + + public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + String interesting = null; + + // sanity check that we at least have either eval or validation data + if ( eval == null && !isValidVC(validation) ) + return interesting; + + if ( concordanceStats == null ) { + if ( eval != null ) { + // initialize the concordance table + createConcordanceTable(eval); + for ( VariantContext vc : missedValidationData ) + determineStats(null, vc); + missedValidationData = null; + } else { + missedValidationData.add(validation); + return interesting; + } + } + + determineStats(eval, validation); + + return interesting; // we don't capture any interesting sites + } + + private void determineStats(VariantContext eval, VariantContext validation) { + + // determine concordance for eval data + if ( eval != null ) { + + for ( String sample : eval.getSampleNames() ) { + Genotype.Type called = eval.getGenotype(sample).getType(); + Genotype.Type truth; + + if ( !isValidVC(validation) || !validation.hasGenotype(sample) ) + truth = Genotype.Type.NO_CALL; + else + truth = validation.getGenotype(sample).getType(); + + SampleStats stats = concordanceStats.get(sample); + if ( stats == null ) + throw new StingException("Sample " + sample + " has not been seen in a previous eval; this analysis module assumes that all samples are present in each variant context"); + stats.concordance[truth.ordinal()][called.ordinal()]++; + } + } + // otherwise, mark no-calls for all samples + else { + + Genotype.Type called = Genotype.Type.NO_CALL; + + for ( String sample : validation.getSampleNames() ) { + SampleStats stats = concordanceStats.get(sample); + if ( stats == null ) + continue; + + Genotype.Type truth = validation.getGenotype(sample).getType(); + stats.concordance[truth.ordinal()][called.ordinal()]++; + } + } + + // determine allele count concordance () + if ( isValidVC(validation) && validation.isPolymorphic() ) { + int trueAlleleCount = 0; + for ( Allele a : validation.getAlternateAlleles() ) + trueAlleleCount += validation.getChromosomeCount(a); + + if ( !alleleCountStats.containsKey(trueAlleleCount) ) + alleleCountStats.put(trueAlleleCount, new FrequencyStats()); + FrequencyStats stats = alleleCountStats.get(trueAlleleCount); + if ( eval != null ) + stats.nFound++; + else + stats.nMissed++; + } + } + + private static boolean isValidVC(VariantContext vc) { + return (vc != null && !vc.isFiltered()); + } + + private void createConcordanceTable(VariantContext vc) { + concordanceStats = new HashMap(); + for ( String sample : vc.getSampleNames() ) + concordanceStats.put(sample, new SampleStats()); + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java index 02ce6058d..32039d96d 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java @@ -109,7 +109,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator { if ( isViolation(vc, momG, dadG, childG) ) { nViolations++; - String label = null; + String label; if ( childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar() )) { label = "KidHomRef_ParentHomVar"; KidHomRef_ParentHomVar++; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/ValidationRate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/ValidationRate.java index 66272b3d1..aa600232c 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/ValidationRate.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/ValidationRate.java @@ -4,9 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; -import java.util.ArrayList; import java.util.List; import java.util.Arrays; @@ -84,7 +82,7 @@ public class ValidationRate extends VariantEvaluator { String interesting = null; if ( validation != null && validation.hasGenotypes() && validation.isNotFiltered() ) { - SiteStats overlap = null; + SiteStats overlap; if ( validation.isPolymorphic() ) { validationStats.nPoly++; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index f5ed2dfae..888900e94 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -3,10 +3,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableVariantContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.*; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; @@ -30,7 +27,6 @@ import java.util.*; // todo -- write a simple column table system and have the evaluators return this instead of the list> objects // todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers) -// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth) // todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval). Needs to process subset of samples in true (pools) // todo -- clustered SNP counter // todo -- HWEs @@ -121,7 +117,7 @@ public class VariantEval2Walker extends RodWalker { @Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false) protected String rsIDFile = null; - @Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of knwon snps", required=false) + @Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of known snps", required=false) protected int maxRsIDBuild = Integer.MAX_VALUE; Set rsIDsToExclude = null; @@ -183,6 +179,8 @@ public class VariantEval2Walker extends RodWalker { private static String KNOWN_SET_NAME = "known"; private static String NOVEL_SET_NAME = "novel"; + private static String NO_COMP_NAME = "N/A"; + private final static String CONTEXT_HEADER = "eval.comp.select.filter.novelty"; private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split("\\.").length; private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS]; @@ -221,6 +219,10 @@ public class VariantEval2Walker extends RodWalker { } } + // if no comp rod was provided, we still want to be able to do evaluations, so use a default comp name + if ( compNames.size() == 0 ) + compNames.add(NO_COMP_NAME); + contexts = initializeEvaluationContexts(evalNames, compNames, selectExps); determineContextNamePartSizes(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java index f12f05692..6727c59bf 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java @@ -6,14 +6,15 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import java.util.List; -import java.util.ArrayList; /** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Jan 29, 2010 - * Time: 3:38:02 PM - * To change this template use File | Settings | File Templates. + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. */ abstract class VariantEvaluator { // protected boolean accumulateInterestingSites = false, printInterestingSites = false; @@ -40,13 +41,11 @@ abstract class VariantEvaluator { //public boolean processedAnySites() { return processedASite; } //protected void markSiteAsProcessed() { processedASite = true; } - /** Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 */ + // Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 public abstract int getComparisonOrder(); - /** called at all sites, regardless of eval context itself; useful for counting processed bases */ - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - } + // called at all sites, regardless of eval context itself; useful for counting processed bases + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null;