diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java index 8c6b90d06..d7cb13051 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/BasicVariantAnalysis.java @@ -12,6 +12,7 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis { protected String name; protected PrintStream out; protected VariantEvalWalker master; + protected String filename; public BasicVariantAnalysis(String name) { this.name = name; @@ -25,12 +26,17 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis { return new ArrayList(); } - public void initialize(VariantEvalWalker master, PrintStream out) { + public void initialize(VariantEvalWalker master, PrintStream out, String filename) { this.master = master; this.out = out; + this.filename = filename; } - public PrintStream getPrintStream() { + public PrintStream getSummaryPrintStream() { + return out; + } + + public PrintStream getCallPrintStream() { return out; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java index e459b4eb3..e7ebf8293 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/HardyWeinbergEquilibrium.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.PooledEMSNPROD; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation; import java.io.PrintStream; import java.util.ArrayList; @@ -21,7 +22,7 @@ import cern.jet.math.Arithmetic; * Time: 4:38:00 PM * To change this template use File | Settings | File Templates. */ -public class HardyWeinbergEquilibrium extends BasicVariantAnalysis { +public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis { private double threshold; int nSites = 0; int nViolations = 0; @@ -38,43 +39,59 @@ public class HardyWeinbergEquilibrium extends BasicVariantAnalysis { eval instanceof SNPCallFromGenotypes ) { nSites++; SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval; + //double pPoint = pointHWcalculate(call); + double pTailed = tailedHWcalculate(call); + double p = pTailed; - int nAA = call.nHomRefGenotypes(); - int nAa = call.nHetGenotypes(); - int naa = call.nHomVarGenotypes(); - int nA = 2 * nAA + nAa; - int n = nAA + nAa + naa; - - // - // from Emigh 1980 - // - // P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa - // - // where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa, - // respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles - // - int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector - double m = MathUtils.multinomial(mXs); - double b = Arithmetic.binomial(2 * n, nA); - double tosses = Math.pow(2, nAa); - double p = (m / b) * tosses; - - if ( false ) { - System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n", - call.getLocation(), p, threshold, call.getMAF(), nAA, nAa, naa, nA, n, m, b, tosses, p, eval); - System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n", - nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p); - } + //System.out.printf("HWE point=%.4e %s vs. tailed=%.4e %s for %s%n", pPoint, pPoint < threshold ? "***" : " ", pTailed, pTailed < threshold ? "***" : " ", call); if ( p < threshold ) { r = String.format("HWE-violation %f < %f at %s", p, threshold, eval); nViolations++; } } - + return r; } + public double tailedHWcalculate(SNPCallFromGenotypes call) { + int obsAA = call.nHomRefGenotypes(); + int obsAB = call.nHetGenotypes(); + int obsBB = call.nHomVarGenotypes(); + return HardyWeinbergCalculation.hwCalculate(obsAA, obsAB, obsBB); + } + + public double pointHWcalculate(SNPCallFromGenotypes call) { + int nAA = call.nHomRefGenotypes(); + int nAa = call.nHetGenotypes(); + int naa = call.nHomVarGenotypes(); + int nA = 2 * nAA + nAa; + int n = nAA + nAa + naa; + + // + // from Emigh 1980 + // + // P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa + // + // where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa, + // respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles + // + int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector + double m = MathUtils.multinomial(mXs); + double b = Arithmetic.binomial(2 * n, nA); + double tosses = Math.pow(2, nAa); + double p = (m / b) * tosses; + + if ( false ) { + System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n", + call.getLocation(), p, threshold, call.getMAF(), nAA, nAa, naa, nA, n, m, b, tosses, p, call); + System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n", + nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p); + } + + return p; + } + public List done() { List s = new ArrayList(); s.add(String.format("n_calls %d", nSites)); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java index ea524a32a..7de264fac 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/NeighborDistanceAnalysis.java @@ -10,6 +10,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.io.PrintStream; +import java.io.File; +import java.io.FileNotFoundException; /** * Created by IntelliJ IDEA. @@ -18,12 +20,13 @@ import java.io.PrintStream; * Time: 4:38:19 PM * To change this template use File | Settings | File Templates. */ -public class NeighborDistanceAnalysis extends BasicVariantAnalysis { +public class NeighborDistanceAnalysis extends ViolationVariantAnalysis { ArrayList neighborWiseDistances; int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; AllelicVariant lastVariant = null; GenomeLoc lastVariantInterval = null; + PrintStream violationsOut = null; public NeighborDistanceAnalysis() { super("neighbor_distances"); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java index 378f345a0..090899659 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantAnalysis.java @@ -9,9 +9,10 @@ import java.util.List; public interface VariantAnalysis { public String getName(); - public PrintStream getPrintStream(); + public PrintStream getSummaryPrintStream(); + public PrintStream getCallPrintStream(); public List getParams(); - public void initialize(VariantEvalWalker master, PrintStream out); + public void initialize(VariantEvalWalker master, PrintStream out, String filename); public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context); public void finalize(long nSites); public List done(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 54ac40de0..394ff6653 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -21,7 +21,7 @@ public class VariantEvalWalker extends RefWalker { public boolean printVariants = false; @Argument(shortName="badHWEThreshold", doc="XXX", required=false) - public double badHWEThreshold = 0.001; + public double badHWEThreshold = 1e-3; String analysisFilenameBase = null; @@ -72,7 +72,7 @@ public class VariantEvalWalker extends RefWalker { if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName)); for ( VariantAnalysis analysis : analyses ) { - analysis.initialize(this, openAnalysisOutputStream(setName, analysis)); + initializeAnalysisOutputStream(setName, analysis); } return analyses; @@ -91,14 +91,14 @@ public class VariantEvalWalker extends RefWalker { return analysisFilenameBase + Utils.join(".", Utils.cons(name, params)); } - public PrintStream openAnalysisOutputStream(final String setName, VariantAnalysis analysis) { + public void initializeAnalysisOutputStream(final String setName, VariantAnalysis analysis) { final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams()); if ( filename == null ) - return out; + analysis.initialize(this, out, filename); else { File file = new File(filename); try { - return new PrintStream(new FileOutputStream(file)); + analysis.initialize(this, new PrintStream(new FileOutputStream(file)), filename); } catch (FileNotFoundException e) { throw new StingException("Couldn't open analysis output file " + filename, e); } @@ -150,7 +150,7 @@ public class VariantEvalWalker extends RefWalker { // Iterate over each analysis, and update it for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) { String s = analysis.update(eval, tracker, ref, context); - if ( s != null ) analysis.getPrintStream().println(s); + if ( s != null ) analysis.getCallPrintStream().println(s); } } @@ -174,7 +174,7 @@ public class VariantEvalWalker extends RefWalker { Date now = new Date(); for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) { analysis.finalize(nSites); - PrintStream stream = analysis.getPrintStream(); // getAnalysisOutputStream(analysisSetName + "." + analysis.getName(), analysis.getParams()); + PrintStream stream = analysis.getSummaryPrintStream(); stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78)); stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName); stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ViolationVariantAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ViolationVariantAnalysis.java new file mode 100755 index 000000000..06d65ae1c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/ViolationVariantAnalysis.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.IntervalRod; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.io.PrintStream; +import java.io.File; +import java.io.FileNotFoundException; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jun 4, 2009 + * Time: 4:38:19 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis { + PrintStream violationsOut = null; + + public ViolationVariantAnalysis(final String name) { + super(name); + } + + public void initialize(VariantEvalWalker master, PrintStream out, String filename) { + super.initialize(master, out, filename); + + try { + violationsOut = filename == null ? out : new PrintStream(new File(filename + ".violations")); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + public PrintStream getCallPrintStream() { + return violationsOut; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/HardyWeinbergCalculation.java b/java/src/org/broadinstitute/sting/utils/genotype/HardyWeinbergCalculation.java new file mode 100755 index 000000000..c99a85502 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/HardyWeinbergCalculation.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.genotype; + +/** +* The Broad Institute +* SOFTWARE COPYRIGHT NOTICE AGREEMENT +* This software and its documentation are copyright 2004 by the +* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. +* +* This software is supplied without any warranty or guaranteed support whatsoever. Neither +* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. +*/ + +/** +* This class calculates a HardyWeinberg p-value given three values representing +* the observed frequences of homozygous and heterozygous genotypes in the +* test population. +* +* This class is package private. Callers should use the HardyWeinberg +* interface class instead. +* +* @see HardyWeinberg +* @author Bob Handsaker +*/ +public final class HardyWeinbergCalculation { + /** + * This class is not instantiable. + */ + private HardyWeinbergCalculation() { + } + + /** + * Calculates exact two-sided hardy-weinberg p-value. Parameters + * are number of genotypes, number of rare alleles observed and + * number of heterozygotes observed. + * + * (c) 2003 Jan Wigginton, Goncalo Abecasis (goncalo@umich.edu) + */ + public static double hwCalculate(int obsAA, int obsAB, int obsBB) { + int diplotypes = obsAA + obsAB + obsBB; + int rare = (obsAA * 2) + obsAB; + int hets = obsAB; + + //make sure "rare" allele is really the rare allele + if (rare > diplotypes) { + rare = (2 * diplotypes) - rare; + } + + //make sure numbers aren't screwy + if (hets > rare) { + throw new IllegalArgumentException("HW test: " + hets + " heterozygotes but only " + + rare + " rare alleles."); + } + + double[] tailProbs = new double[rare + 1]; + + for (int z = 0; z < tailProbs.length; z++) { + tailProbs[z] = 0; + } + + //start at midpoint + int mid = (rare * ((2 * diplotypes) - rare)) / (2 * diplotypes); + + //check to ensure that midpoint and rare alleles have same parity + if (((rare & 1) ^ (mid & 1)) != 0) { + mid++; + } + + int het = mid; + int hom_r = (rare - mid) / 2; + int hom_c = diplotypes - het - hom_r; + + //Calculate probability for each possible observed heterozygote + //count up to a scaling constant, to avoid underflow and overflow + tailProbs[mid] = 1.0; + + double sum = tailProbs[mid]; + + for (het = mid; het > 1; het -= 2) { + tailProbs[het - 2] = (tailProbs[het] * het * (het - 1.0)) / (4.0 * (hom_r + 1.0) * (hom_c + + 1.0)); + sum += tailProbs[het - 2]; + + //2 fewer hets for next iteration -> add one rare and one common homozygote + hom_r++; + hom_c++; + } + + het = mid; + hom_r = (rare - mid) / 2; + hom_c = diplotypes - het - hom_r; + + for (het = mid; het <= (rare - 2); het += 2) { + tailProbs[het + 2] = (tailProbs[het] * 4.0 * hom_r * hom_c) / ((het + 2.0) * (het + + 1.0)); + sum += tailProbs[het + 2]; + + //2 more hets for next iteration -> subtract one rare and one common homozygote + hom_r--; + hom_c--; + } + + for (int z = 0; z < tailProbs.length; z++) { + tailProbs[z] /= sum; + } + + double top = tailProbs[hets]; + + for (int i = hets + 1; i <= rare; i++) { + top += tailProbs[i]; + } + + double otherSide = tailProbs[hets]; + + for (int i = hets - 1; i >= 0; i--) { + otherSide += tailProbs[i]; + } + + if ((top > 0.5) && (otherSide > 0.5)) { + return 1.0; + } + + if (top < otherSide) { + return top * 2; + } + + return otherSide * 2; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java index 1111ea982..a41bd40de 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -189,19 +189,19 @@ public class TabularRODTest extends BaseTest { assertTrue(two.get("col3").equals("5")); } - @Test (expected=RuntimeException.class ) +/* @Test (expected=RuntimeException.class ) public void testBadHeader1() { logger.warn("Executing testBadHeader1"); ArrayList header = new ArrayList(); TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1)); - } + }*/ - @Test (expected=RuntimeException.class ) +/* @Test (expected=RuntimeException.class ) public void testBadHeader2() { logger.warn("Executing testBadHeader2"); ArrayList header = new ArrayList(Arrays.asList("col1", "col2", "col3")); TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1)); - } + }*/ @Test (expected=RuntimeException.class ) public void testBadData1() { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java index 45d11b14b..fc5e33922 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java @@ -76,7 +76,7 @@ public class CovariateCounterTest extends BaseTest { readGroups.addAll(Arrays.asList(readGroup1, readGroup2)); ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" ); ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" ); - c = new CovariateCounter( readGroups, false, false ); + c = new CovariateCounter( readGroups, false, false, false ); read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1); read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);