From 0548026a2e4cf89db6947643dece37494f19b3db Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 16 Jul 2009 21:03:26 +0000 Subject: [PATCH] Now understanding GLFs for calculating genotyping results like callable bases, as well as avoids emitting stupid amounts of data when doing a genotype evaluation (i.e., ignores non-SNP() calls) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1267 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/CallableBasesAnalysis.java | 88 +++++++++++++++++++ .../varianteval/ClusterCounterAnalysis.java | 2 +- .../varianteval/NeighborDistanceAnalysis.java | 2 +- .../varianteval/VariantDBCoverage.java | 4 +- .../varianteval/VariantEvalWalker.java | 48 +++++++++- 5 files changed, 139 insertions(+), 5 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/CallableBasesAnalysis.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/CallableBasesAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/CallableBasesAnalysis.java new file mode 100755 index 000000000..f237dfd69 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/CallableBasesAnalysis.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; + +import java.io.PrintStream; +import java.util.List; +import java.util.Arrays; +import java.util.ArrayList; + +/** + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ +public class CallableBasesAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis { + long all_bases = 0; + long all_calls = 0; + //final static double[] Qthresholds = { 10, 20, 30, 40, 50, 100, 200, 500, 1000 }; + final static double[] thresholds = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100 }; + long[] discoverable_bases = new long[thresholds.length]; + long[] genotypable_bases = new long[thresholds.length]; + + public CallableBasesAnalysis() { + super("callable_bases"); + } + + public long nSites() { return all_bases; } + public long nCalls() { return all_calls; } + public long nDiscoverable(int index) { return discoverable_bases[index]; } + public double percentDiscoverable(int index) { return (100.0*nDiscoverable(index)) / nSites(); } + public long nGenotypable(int index) { return genotypable_bases[index]; } + public double percentGenotypable(int index) { return (100.0*nGenotypable(index)) / nSites(); } + + public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { + all_bases++; + + if ( eval == null ) // no data here! + return null; + + // we actually have a record here + if ( ! eval.isGenotype() ) { // evaluation record isn't a genotype, die! + throw new RuntimeException("Evaluation track isn't an Genotype!"); + } + + all_calls++; + // For every threshold, updated discoverable and callable + for ( int i = 0; i < thresholds.length; i++ ) { + double threshold = thresholds[i]; + + // update discoverable + if ( eval.isSNP() && eval.getVariationConfidence() >= threshold ) + discoverable_bases[i]++; + if ( eval.isReference() && eval.getConsensusConfidence() >= threshold ) + discoverable_bases[i]++; + + if ( eval.getConsensusConfidence() >= threshold ) + genotypable_bases[i]++; + + //System.out.printf("Updating %s SNP=%b, REF=%b VarConf=%f ConConf=%f where threshold=%f: discoverable = %d, genotypable = %d%n", + // eval.getLocation(), eval.isSNP(), eval.isReference(), eval.getVariationConfidence(), + // eval.getConsensusConfidence(), threshold, discoverable_bases[i], genotypable_bases[i]); + } + + return null; + } + + public List done() { + List s = new ArrayList(); + s.add(String.format("total_no_sites %d", nSites())); + s.add(String.format("total_no_calls %d", nCalls())); + s.add(String.format("")); + s.add(String.format("confidence_threshold\tdiscoverable_bases\tdiscoverable_bases_percent\tgenotypable_bases\tgenotypable_bases_percent")); + + for ( int i = 0; i < thresholds.length; i++ ) { + double threshold = thresholds[i]; + s.add(String.format("%6.2f\t%d\t%.6f\t%d\t%.6f", threshold, nDiscoverable(i), percentDiscoverable(i), nGenotypable(i), percentGenotypable(i))); + } + + return s; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java index df54ac8cb..e4beb9c5e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ClusterCounterAnalysis.java @@ -40,7 +40,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { String r = null; - if ( eval != null ) { + if ( eval != null && eval.isSNP() ) { IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null); GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java index 5c0d33fc2..02c4e5e8e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java @@ -39,7 +39,7 @@ public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implement public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { String r = null; - if ( eval != null ) { + if ( eval != null && eval.isSNP() ) { IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null); GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java index d1cf902f9..20b2cab4c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java @@ -53,8 +53,8 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { // There are four cases here: AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null); - inc(dbsnp != null, eval != null); - return dbsnp == null && eval != null ? "Novel " + eval : null; + inc(dbsnp != null, eval != null && eval.isSNP()); + return dbsnp == null && eval != null && eval.isSNP() ? "Novel " + eval : null; } /** diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 19e994d5c..b892b2479 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotype.glf.GLFRecord; +import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.*; @@ -87,6 +91,7 @@ public class VariantEvalWalker extends RefWalker { analyses.add(new NeighborDistanceAnalysis()); analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold)); analyses.add(new ClusterCounterAnalysis()); + analyses.add(new CallableBasesAnalysis()); // // Filter out analyzes inappropriate for our evaluation type Population or Genotype @@ -97,7 +102,7 @@ public class VariantEvalWalker extends RefWalker { boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis); boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis); boolean disable = disableForGenotyping | disableForPopulation; - String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null); + String causeName = disableForGenotyping ? "population" : (disableForPopulation ? "genotype" : null); if ( disable ) { logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName)); iter.remove(); @@ -141,9 +146,50 @@ public class VariantEvalWalker extends RefWalker { } } + // OMG this is painful + private rodVariants glf2geli(char ref, RodGLF glf) { + SinglePointCall rec = (SinglePointCall)glf.mRecord; + // contig pos refBase depth maxMappingQ bestGenotype lodBtr lodBtnb genotypes + Integer[] sorted = Utils.SortPermutation(rec.getLikelihoods()); + int bestIndex = sorted[0]; + char[] refs = {ref, ref}; + String homRef = new String(refs); + int refIndex = rodVariants.Genotype.valueOf(homRef).ordinal(); + double refLikelihood = rec.getLikelihoods()[refIndex]; + double bestLikelihood = rec.getLikelihoods()[bestIndex]; + double secondBestLikelihood = rec.getLikelihoods()[sorted[1]]; + + rodVariants var = new rodVariants("eval"); + var.loc = glf.getLocation(); + var.refBase = ref; + var.depth = rec.getReadDepth(); + var.maxMappingQuality = rec.getRmsMapQ(); + var.bestGenotype = rodVariants.Genotype.values()[bestIndex].toString(); + var.lodBtr = Math.abs((bestLikelihood - refLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR); + var.lodBtnb = Math.abs((bestLikelihood - secondBestLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR); + var.genotypeLikelihoods = rec.getLikelihoods(); + for ( int i = 0; i < var.genotypeLikelihoods.length; i++ ) + var.genotypeLikelihoods[i] /= GLFRecord.LIKELIHOOD_SCALE_FACTOR; + + if ( false ) { + System.out.printf("Converting : %s%n", glf); + System.out.printf(" homRef: %s%n", homRef); + System.out.printf(" refindex : %d%n", refIndex); + System.out.printf(" bestIndex : %d%n", sorted[0]); + System.out.printf(" 2ndindex : %d%n", sorted[1]); + System.out.printf(" => %s%n", var); + } + + return var; + } + public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { nSites++; + if ( tracker.lookup("eval", null) instanceof RodGLF) { + tracker.bind("eval", glf2geli(ref, (RodGLF)tracker.lookup("eval", null))); + } + // Iterate over each analysis, and update it AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);