From 1ab00e5895236f9e500db686b5c9d914c92faed3 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 20 May 2010 14:10:56 +0000 Subject: [PATCH] Retiring multi-sample genotyper git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3401 348d0f76-0448-11de-a6fe-93d51630548a --- .../multisamplecaller/BasicPileup.java | 186 --- .../ClassicGenotypeLikelihoods.java | 508 -------- .../multisamplecaller/ConfusionMatrix.java | 65 - .../multisamplecaller/IndelLikelihood.java | 113 -- .../multisamplecaller/MultiSampleCaller.java | 1093 ----------------- 5 files changed, 1965 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/BasicPileup.java delete mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ClassicGenotypeLikelihoods.java delete mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ConfusionMatrix.java delete mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/IndelLikelihood.java delete mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/MultiSampleCaller.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/BasicPileup.java b/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/BasicPileup.java deleted file mode 100755 index 88a4ef3aa..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/BasicPileup.java +++ /dev/null @@ -1,186 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import org.broadinstitute.sting.utils.*; - -import net.sf.samtools.*; - -import java.util.List; -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Apr 14, 2009 - * Time: 8:54:05 AM - * To change this template use File | Settings | File Templates. - */ -abstract public class BasicPileup { - public static final char DELETION_CHAR = 'D'; - - @Deprecated - abstract GenomeLoc getLocation(); - @Deprecated - abstract char getRef(); - @Deprecated - abstract int size(); - - - /** - * This is the right way to get bases - * - * @return - */ - @Deprecated - byte[] getBases() { return null; } - - /** - * This is the right way to get quals - * - * @return - */ - @Deprecated - byte[] getQuals() { return null; } - - /** - * This is a terrible way to get bases. Use getBases() or getBasesAsArrayList() - * - * @return - */ - @Deprecated - String getBasesAsString() { return null; } - - /** - * This is a terrible way to get quals. Use getQuals() or getQualsAsArrayList() - * - * @return - */ - @Deprecated - String getQualsAsString() { return null; } - - public String getPileupString() { - return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString()); - } - - // - // ArrayList methods - // - public static byte[] getBasesAsArray( List reads, List offsets ) { - byte array[] = new byte[reads.size()]; - int index = 0; - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if ( offset == -1 ) { - array[index++] = ((byte)DELETION_CHAR); - } else { - array[index++] = read.getReadBases()[offset]; - } - } - return array; - } - - @Deprecated - public static ArrayList getBasesAsArrayList( List reads, List offsets ) { - ArrayList bases = new ArrayList(reads.size()); - for (byte value : getBasesAsArray(reads, offsets)) - bases.add(value); - return bases; - } - - @Deprecated - public static ArrayList getQualsAsArrayList( List reads, List offsets ) { - ArrayList quals = new ArrayList(reads.size()); - for (byte value : getQualsAsArray(reads, offsets)) - quals.add(value); - return quals; - } - - @Deprecated - public static byte[] getQualsAsArray( List reads, List offsets ) { - byte array[] = new byte[reads.size()]; - int index = 0; - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - // skip deletion sites - if ( offset == -1 ) { - array[index++] = ((byte)0); - } else { - array[index++] = read.getBaseQualities()[offset]; - } - } - return array; - } - - @Deprecated - public static ArrayList mappingQualPileup( List reads) { - ArrayList quals = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - byte qual = (byte)read.getMappingQuality(); - quals.add(qual); - } - return quals; - } - - @Deprecated // todo -- delete me - public static String[] indelPileup( List reads, List offsets ) - { - String[] indels = new String[reads.size()]; - - for (int i = 0; i < reads.size(); i++) - { - SAMRecord read = reads.get(i); - Cigar cigar = read.getCigar(); - int offset = offsets.get(i); - - String cigar_string = read.getCigarString(); - if (! (cigar_string.contains("I") || cigar_string.contains("D"))) { indels[i] = "null"; continue; } - - //System.out.printf("%s:%d %s %s %s ", read.getReferenceName(), read.getAlignmentStart(), read.getReadName(), read.getReadString(), cigar_string); - int k = 0; - for (int j = 0; j < cigar.numCigarElements(); j++) - { - CigarOperator operator = cigar.getCigarElement(j).getOperator(); - int length = cigar.getCigarElement(j).getLength(); - if (operator == CigarOperator.M) - { - k += length; - } - else if ((k == offset+1) && (operator == CigarOperator.I)) - { - // this insertion is associated with this offset (kinda ;) ). - indels[i] = read.getReadString().substring(k, k+length); - //System.out.printf("(I,%d,%d)", k, offset); - break; - } - else if ((k != offset+1) && (operator == CigarOperator.I)) - { - //System.out.printf("(i,%d,%d)", k, offset); - k += length; - } - else if ((k == offset) && (operator == CigarOperator.D)) - { - // this deletion is associated with this offset. - indels[i] = length + "D"; - //System.out.printf("(D,%d,%d)", k, offset); - break; - } - else if (k >= offset) - { - // no indel here. - indels[i] = "null"; - //System.out.printf("(N,%d,%d)", k, offset); - break; - } - } - if (indels[i] == null) { indels[i] = "null"; } - //System.out.printf("\n"); - } - - return indels; - } -} - - diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ClassicGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ClassicGenotypeLikelihoods.java deleted file mode 100644 index 8209fc949..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ClassicGenotypeLikelihoods.java +++ /dev/null @@ -1,508 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import org.broadinstitute.sting.utils.MathUtils; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; -import java.lang.Cloneable; - -public class ClassicGenotypeLikelihoods implements Cloneable { - // precalculate these for performance (pow/log10 is expensive!) - private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; - - static { - for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { - oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0))); - //oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual)); - } - } - - private static double getOneMinusQual(final byte qual) { - return oneMinusData[qual]; - } - - private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; - - static { - for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { - oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0); - //oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0); - } - } - - private static double getOneHalfMinusQual(final byte qual) { - return oneHalfMinusData[qual]; - } - - public double[] likelihoods; - public String[] genotypes; - public int coverage; - - // The genotype priors; - private double priorHomRef; - private double priorHet; - private double priorHomVar; - public String[] sorted_genotypes; - public double[] sorted_likelihoods; - double ref_likelihood = Double.NaN; - private IndelLikelihood indel_likelihood; - - // Store the 2nd-best base priors for on-genotype primary bases - //private HashMap onNextBestBasePriors = new HashMap(); - - // Store the 2nd-best base priors for off-genotype primary bases - //private HashMap offNextBestBasePriors = new HashMap(); - - private static double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; - private static double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; - - public ClassicGenotypeLikelihoods() { - initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods(boolean foo) { - } - - public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); - } - - public ClassicGenotypeLikelihoods clone() { - ClassicGenotypeLikelihoods c = new ClassicGenotypeLikelihoods(false); - c.likelihoods = this.likelihoods.clone(); - c.genotypes = this.genotypes.clone(); - c.coverage = this.coverage; - - // The genotype priors; - c.priorHomRef = this.priorHomRef; - c.priorHet = this.priorHet; - c.priorHomVar = this.priorHomVar; - //public String[] sorted_genotypes; - //public double[] sorted_likelihoods; - //double ref_likelihood = Double.NaN; - //private IndelLikelihood indel_likelihood; - return c; - } - - private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { - this.priorHomRef = priorHomRef; - this.priorHet = priorHet; - this.priorHomVar = priorHomVar; - - likelihoods = new double[10]; - genotypes = new String[10]; - coverage = 0; - - for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } - - genotypes[0] = "AA"; - genotypes[1] = "AC"; - genotypes[2] = "AG"; - genotypes[3] = "AT"; - genotypes[4] = "CC"; - genotypes[5] = "CG"; - genotypes[6] = "CT"; - genotypes[7] = "GG"; - genotypes[8] = "GT"; - genotypes[9] = "TT"; - - //for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - // onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); - // offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); - //} - } - - public double getHomRefPrior() { - return priorHomRef; - } - - public void setHomRefPrior(double priorHomRef) { - this.priorHomRef = priorHomRef; - } - - public double getHetPrior() { - return priorHet; - } - - public void setHetPrior(double priorHet) { - this.priorHet = priorHet; - } - - public double getHomVarPrior() { - return priorHomVar; - } - - public void setHomVarPrior(double priorHomVar) { - this.priorHomVar = priorHomVar; - } - -// public double[] getOnGenotypeSecondaryPriors() { -// double[] p2ndon = new double[10]; -// -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); -// } -// -// return p2ndon; -// } -// -// public void setOnGenotypeSecondaryPriors(double[] p2ndon) { -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); -// } -// } -// -// public double[] getOffGenotypeSecondaryPriors() { -// double[] p2ndoff = new double[10]; -// -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); -// } -// -// return p2ndoff; -// } -// -// public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { -// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { -// offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); -// } -// } - - public void add(char ref, char read, byte qual) - { - if (qual <= 0) { qual = 1; } - - if (coverage == 0) - { - for (int i = 0; i < likelihoods.length; i++) - { - likelihoods[i] = 0; - } - } - double sum = 0.0; - for (int i = 0; i < genotypes.length; i++) - { - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); - likelihoods[i] += likelihood; - } - coverage += 1; - } - - public void add(char ref, char read, byte qual, ConfusionMatrix matrix, String platform) - { - if (qual <= 0) { qual = 1; } - if (platform == null) { platform = "ILLUMINA"; } - if (read == 'N') { return; } - - if (coverage == 0) - { - for (int i = 0; i < likelihoods.length; i++) - { - likelihoods[i] = 0; - } - } - double sum = 0.0; - for (int i = 0; i < genotypes.length; i++) - { - char h1 = genotypes[i].charAt(0); - char h2 = genotypes[i].charAt(1); - - double p1 = matrix.lookup(platform, read, h1); - double p2 = matrix.lookup(platform, read, h2); - - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual, p1, p2); - - //System.out.printf("DBG: %c %c %s %d %f %f %f\n", ref, read, genotypes[i], qual, p1, p2, likelihood); - - likelihoods[i] += likelihood; - } - coverage += 1; - } - - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { - if (qual == 0) { - qual = 1; - } // zero quals are wrong. - - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); - - double p_base; - - if ((h1 == h2) && (h1 == read)) { - // hom - p_base = getOneMinusQual(qual); - } else if ((h1 != h2) && ((h1 == read) || (h2 == read))) { - // het - p_base = getOneHalfMinusQual(qual); - } else { - // error - p_base = qual / -10.0; - } - - return p_base; - } - - public void TEST() - { - double p_A2A = 1.00; - double p_T2T = 1.00; - - double p_A2T = 0.75; - double p_T2A = 0.25; - - char ref = 'A'; - - System.out.printf("\tA\tT\n"); - System.out.printf("A\t%.02f\t%.02f\n", p_A2A, p_A2T); - System.out.printf("T\t%.02f\t%.02f\n", p_T2A, p_T2T); - System.out.printf("\n"); - - System.out.printf("P(A,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'A', "AA", (byte)20, p_A2A, p_A2A)); - System.out.printf("P(A,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'A', "AT", (byte)20, p_A2A, p_A2T)); - System.out.printf("P(A,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'A', "TT", (byte)20, p_A2T, p_A2T)); - - System.out.printf("P(T,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'T', "AA", (byte)20, p_T2A, p_T2A)); - System.out.printf("P(T,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'T', "AT", (byte)20, p_T2A, p_T2T)); - System.out.printf("P(T,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'T', "TT", (byte)20, p_T2T, p_T2T)); - - //System.exit(0); - } - - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual, double p1, double p2) { - if (qual == 0) { - qual = 1; - } // zero quals are wrong. - - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); - - double perr = Math.pow(10.0,qual/-10.0); - - double p_base = 0; - - if (read == h1) - { - p_base += (1.0 - perr); - } - else - { - p_base += perr * p1; - } - - if (read == h2) - { - p_base += (1.0 - perr); - } - else - { - p_base += perr * p2; - } - - p_base = Math.log10(p_base/2.0); - - return p_base; - } - - public void sort() { - Integer[] permutation = MathUtils.sortPermutation(likelihoods); - - Integer[] reverse_permutation = new Integer[permutation.length]; - for (int i = 0; i < reverse_permutation.length; i++) { - reverse_permutation[i] = permutation[(permutation.length - 1) - i]; - } - - sorted_genotypes = MathUtils.permuteArray(genotypes, reverse_permutation); - sorted_likelihoods = MathUtils.permuteArray(likelihoods, reverse_permutation); - } - - public String toString(char ref) { - this.sort(); - double sum = 0; - String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); - for (int i = 0; i < sorted_genotypes.length; i++) { - if (i != 0) { - s = s + " "; - } - s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); - sum += Math.pow(10,sorted_likelihoods[i]); - } - s = s + String.format(" %f", sum); - return s; - } - - public void ApplyPrior(char ref, double[] allele_likelihoods) - { - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - if (i == j) - { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); - } - else - { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); - } - k++; - } - } - this.sort(); - } - - public void ApplyPrior(char ref, char alt, double p_alt) { - for (int i = 0; i < genotypes.length; i++) { - if ((p_alt == -1) || (p_alt <= 1e-6)) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - // hom-ref - likelihoods[i] += Math.log10(priorHomRef); - } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { - // hom-nonref - likelihoods[i] += Math.log10(priorHomVar); - } else { - // het - likelihoods[i] += Math.log10(priorHet); - } - if (Double.isInfinite(likelihoods[i])) { - likelihoods[i] = -1000; - } - } else { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - // hom-ref - likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); - } else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt)) { - // hom-nonref - likelihoods[i] += 2.0 * Math.log10(p_alt); - } else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) || - ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt))) { - // het - likelihoods[i] += Math.log10((1.0 - p_alt) * p_alt * 2.0); - } else { - // something else (noise!) - likelihoods[i] += Math.log10(1e-5); - } - - if (Double.isInfinite(likelihoods[i])) { - likelihoods[i] = -1000; - } - } - } - this.sort(); - } - - public void ApplyWeight(double weight) - { - double log10weight = Math.log10(weight); - for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += log10weight; } - this.sort(); - } - -// public void applySecondBaseDistributionPrior(String primaryBases, String secondaryBases) { -// for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) { -// char firstAllele = genotypes[genotypeIndex].charAt(0); -// char secondAllele = genotypes[genotypeIndex].charAt(1); -// -// int offIsGenotypic = 0; -// int offTotal = 0; -// -// int onIsGenotypic = 0; -// int onTotal = 0; -// -// for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) { -// char primaryBase = primaryBases.charAt(pileupIndex); -// -// if (secondaryBases != null) { -// char secondaryBase = secondaryBases.charAt(pileupIndex); -// -// if (primaryBase != firstAllele && primaryBase != secondAllele) { -// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { -// offIsGenotypic++; -// } -// offTotal++; -// } else { -// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { -// onIsGenotypic++; -// } -// onTotal++; -// } -// } -// } -// -// double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex])); -// double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex])); -// -// likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior); -// } -// this.sort(); -// } - - public double LodVsNextBest() { - this.sort(); - return sorted_likelihoods[0] - sorted_likelihoods[1]; - } - - public double LodVsRef(char ref) { - if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { - ref_likelihood = sorted_likelihoods[0]; - return (-1.0 * this.LodVsNextBest()); - } else { - for (int i = 0; i < genotypes.length; i++) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - ref_likelihood = likelihoods[i]; - } - } - } - return sorted_likelihoods[0] - ref_likelihood; - } - - public String BestGenotype() { - this.sort(); - return this.sorted_genotypes[0]; - } - - public double BestPosterior() { - this.sort(); - return this.sorted_likelihoods[0]; - } - - public double RefPosterior(char ref) - { - this.LodVsRef(ref); - return this.ref_likelihood; - } - - public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } - public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; } - -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ConfusionMatrix.java b/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ConfusionMatrix.java deleted file mode 100644 index 8b51648e5..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/ConfusionMatrix.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import java.lang.*; -import java.util.*; -import java.io.*; - -import org.broadinstitute.sting.utils.*; - -public class ConfusionMatrix -{ - private double[][] ILLUMINA; - private double[][] solid; - private double[][] LS454; - - public ConfusionMatrix(String file_name) - { - //System.out.println("DBG: ConfusionMatrix constructor! (" + file_name + ")"); - - ILLUMINA = new double[4][4]; - solid = new double[4][4]; - LS454 = new double[4][4]; - - try - { - Scanner sc = new Scanner(new File(file_name)); - while (sc.hasNext()) - { - String platform = sc.next(); - char read = sc.next().charAt(0); - char ref = sc.next().charAt(0); - double p = sc.nextDouble(); - - int read_i = BaseUtils.simpleBaseToBaseIndex(read); - int ref_i = BaseUtils.simpleBaseToBaseIndex(ref); - - if (platform.equals("ILLUMINA")) { ILLUMINA[read_i][ref_i] = p; } - if (platform.equals("solid")) { solid[read_i][ref_i] = p; } - if (platform.equals("LS454")) { LS454[read_i][ref_i] = p; } - - //System.out.println("DBG: " + key + " " + p); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - } - - double lookup(String platform, char read, char truth) - { - int read_i = BaseUtils.simpleBaseToBaseIndex(read); - int truth_i = BaseUtils.simpleBaseToBaseIndex(truth); - - double d = 0; - - if (platform.equals("ILLUMINA")) { d = ILLUMINA[read_i][truth_i]; } - else if (platform.equals("solid")) { d = solid[read_i][truth_i]; } - else if (platform.equals("LS454")) { d = LS454[read_i][truth_i]; } - else { assert(false); } - - return d; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/IndelLikelihood.java b/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/IndelLikelihood.java deleted file mode 100755 index 71ebd9498..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/IndelLikelihood.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import java.util.HashMap; - -public class IndelLikelihood { - private String type; - private String[] alleles; - private double p; - private double lod; - - private double pRef; - private double pHet; - private double pHom; - private String alt; - - public IndelLikelihood(String type, String[] alleles, double p, double lod) { - initialize(type, alleles, p, lod); - } - - public IndelLikelihood(String[] indels, double indel_alt_freq) - { - HashMap indel_allele_counts = new HashMap(); - - for (int i = 0; i < indels.length; i++) { - if (! indel_allele_counts.containsKey(indels[i])) { - indel_allele_counts.put(indels[i], 1); - } else { - indel_allele_counts.put(indels[i], indel_allele_counts.get(indels[i])+1); - } - } - - Object[] keys = indel_allele_counts.keySet().toArray(); - String[] alleles = new String[keys.length]; - int[] counts = new int[keys.length]; - //double likelihoods[] = new double[keys.length]; - int null_count = 0; - String max_allele = null; - int max_count = -1; - - if ((keys.length > 0) && (! ((keys.length == 1) && (((String)keys[0]).equals("null"))))) { - for (int i = 0; i < keys.length; i++) { - Integer count = (Integer)indel_allele_counts.get(keys[i]); - alleles[i] = (String)keys[i]; - counts[i] = count; - if (alleles[i].equals("null")) { null_count = counts[i]; } - else if (counts[i] > max_count) { max_count = counts[i]; max_allele = alleles[i]; } - //System.out.printf("%s[%d] ", keys[i], count); - } - //System.out.printf("\n"); - - double eps = 1e-3; - pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq); - pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq); - pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq); - - double lodRef = pRef - Math.max(pHet, pHom); - double lodHet = pHet - pRef; - double lodHom = pHom - pRef; - - //System.out.printf("%s/%s %f %f\n", "null", "null", pRef, lodRef); - //System.out.printf("%s/%s %f %f\n", max_allele, "null", pHet, lodHet); - //System.out.printf("%s/%s %f %f\n", max_allele, max_allele, pHom, lodHom); - //System.out.printf("\n"); - - if (lodRef > 0) { - // reference call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = "null"; - - //return new IndelLikelihood("ref", genotype, pRef, lodRef); - initialize("ref", genotype, pRef, lodRef); - } else if (lodHet > lodHom) { - // het call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = max_allele; - - //return new IndelLikelihood("het", genotype, pHet, lodHet); - initialize("het", genotype, pHet, lodHet); - } else { - // hom call - String[] genotype = new String[2]; - genotype[0] = max_allele; - genotype[1] = max_allele; - - //return new IndelLikelihood("hom", genotype, pHom, lodHom); - initialize("hom", genotype, pHom, lodHom); - } - } - } - - private void initialize(String type, String[] alleles, double p, double lod) { - this.type = type; - this.alleles = alleles; - this.p = p; - this.lod = lod; - } - - public String getAlt() { return alt; } - public double pRef() { return pRef; } - public double pHet() { return pHet; } - public double pHom() { return pHom; } - - public String getType() { return type; } - public String[] getAlleles() { return alleles; } - public double getPosteriorProbability() { return p; } - public double getLOD() { return lod; } - - public String toString() { - return String.format("%s/%s %f %f", alleles[0], alleles[1], p, lod); - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/MultiSampleCaller.java deleted file mode 100644 index fdc1a61cb..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/multisamplecaller/MultiSampleCaller.java +++ /dev/null @@ -1,1093 +0,0 @@ - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.multisamplecaller; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.*; -import java.util.zip.GZIPOutputStream; - -// Beta iterative multi-sample caller -// j.maguire 6-11-2009 - -public class MultiSampleCaller extends LocusWalker -{ - @Argument(required=false, shortName="fractional_counts", doc="should we use fractional counts?") public boolean FRACTIONAL_COUNTS = false; - @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; - @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT = null; - @Argument(fullName="individual_output", shortName="individual_output", required=false, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT = null; - @Argument(fullName="stats_output", shortName="stats_output", required=false, doc="file to write stats to") public String STATS_OUTPUT = null; - @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; - @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; - @Argument(fullName="weight_samples", shortName="weight_samples", required=false, doc="rw-weight samples during EM?") public boolean WEIGHT_SAMPLES = false; - - @Argument(fullName="theta", shortName="theta", required=false, doc="rate of sequence divergence") public double THETA = 1e-3; - @Argument(fullName="allele_frequency_prior", shortName="allele_frequency_prior", required=false, doc="use prior on allele frequencies? (P(f) = theta/(N*f)") public boolean ALLELE_FREQUENCY_PRIOR = false; - - @Argument(fullName="confusion_matrix_file", shortName="confusion_matrix_file", required=false, doc="file containing confusion matrix for all three technologies") public String CONFUSION_MATRIX_FILE = null; - - @Argument(fullName="ALLELE_FREQ_TOLERANCE", shortName="AFT", required=false, doc="") - public double ALLELE_FREQ_TOLERANCE = 1e-6; - - @Argument(fullName="append", shortName="append", required=false, doc="if the discovery file already exists, don't re-call sites that are done.") boolean APPEND = false; - - private static final double MIN_LOD_FOR_STRAND = 0.01; - - // Private state. - protected List sample_names; - protected SAMFileHeader header; - protected PrintStream individual_output_file = null; - protected PrintStream discovery_output_file = null; - protected PrintStream stats_output_file = null; - - private boolean INCLUDE_STATS = false; - private boolean INCLUDE_GENOTYPES = false ; - - class MultiSampleCallResult - { - GenomeLoc location; - char ref; - char alt; - EM_Result em_result; - double lod; - double strand_score; - double pD; - double pNull; - String in_dbsnp; - int n_ref; - int n_het; - int n_hom; - int EM_N; - double alt_freq; - - public MultiSampleCallResult(GenomeLoc location, char ref, char alt, EM_Result em_result, double lod, double strand_score, double pD, double pNull, String in_dbsnp, int n_ref, int n_het, int n_hom, int EM_N, double alt_freq) - { - this.location = location; - this.ref = ref; - this.alt = alt; - this.em_result = em_result; - this.lod = lod; - this.strand_score = strand_score; - this.pD = pD; - this.pNull = pNull; - this.in_dbsnp = in_dbsnp; - this.n_ref = n_ref; - this.n_het = n_het; - this.n_hom = n_hom; - this.EM_N = EM_N; - this.alt_freq = alt_freq; - } - - public MultiSampleCallResult() { } // this is just so I can do new MultiSampleCallResult().header(). "inner classes cannot have static declarations" :( - - public String header() - { - return new String("loc ref alt lod strand_score pD pNull in_dbsnp pA pC pG pT EM_alt_freq EM_N n_ref n_het n_hom"); - } - - public String toString() - { - String s = ""; - s = s + String.format("%s %c %c %f %f %f %f %s ", location, ref, alt, lod, strand_score, pD, pNull, in_dbsnp); - for (int i = 0; i < 4; i++) { s = s + String.format("%f ", em_result.allele_likelihoods[i]); } - s = s + String.format("%f %d %d %d %d", alt_freq, em_result.EM_N, n_ref, n_het, n_hom); - return s; - } - } - - public static class DepthStats - { - public static String Header() - { - return "loc ref depth A C G T a c g t mq_min mq_mean mq_median mq_max mq_sd"; - } - - public static String Row(char ref, AlignmentContext context) - { - String ans = ""; - List reads = context.getReads(); - List offsets = context.getOffsets(); - //Pileup pileup = new ReadBackedPileupOld(ref, context); - - ans += String.format("%s ", context.getLocation()); - ans += String.format("%c ", ref); - ans += String.format("%d ", reads.size()); - ans += String.format("%d ", countBase(context, 'A', "+")); - ans += String.format("%d ", countBase(context, 'C', "+")); - ans += String.format("%d ", countBase(context, 'G', "+")); - ans += String.format("%d ", countBase(context, 'T', "+")); - ans += String.format("%d ", countBase(context, 'A', "-")); - ans += String.format("%d ", countBase(context, 'C', "-")); - ans += String.format("%d ", countBase(context, 'G', "-")); - ans += String.format("%d ", countBase(context, 'T', "-")); - - ans += String.format("%s ", Stats(BasicPileup.mappingQualPileup(reads))); - - return ans; - } - - static int countBase(AlignmentContext context, char base, String strand) - { - int count = 0; - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i = 0; i < reads.size(); i++) - { - if (reads.get(i).getReadString().charAt(offsets.get(i)) == base) - { - if (strand.equals("+") && (reads.get(i).getReadNegativeStrandFlag()==false)) { count += 1; } - else if (strand.equals("-") && (reads.get(i).getReadNegativeStrandFlag()==true)) { count += 1; } - else if (! (strand.equals("+") || strand.equals("-"))) { count += 1; } - } - } - return count; - } - - public static String Stats(ArrayList X) - { - Collections.sort(X); - - long count = 0; - long sum = 0; - long min = X.get(0); - long max = X.get(0); - long median = X.get(0); - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - if (x < min) { min = x; } - if (x > max) { max = x; } - sum += x; - count += 1; - if (i == X.size()/2) { median = x; } - } - - double mean = sum/count; - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - sum += (x-mean)*(x-mean); - count += 1; - } - double sd = Math.sqrt(sum/count); - - return String.format("%d %f %d %d %f", min, mean, median, max, sd); - } - } - - GenomeLoc highest_previously_done_loc = null; - private boolean in_skip_mask(GenomeLoc loc) - { - if (highest_previously_done_loc == null) { return false; } - if (highest_previously_done_loc.compareTo(loc) < 0) { return false; } - else { return true; } - } - - private void maybeInitializeDisoveryOutput() - { - if ( discovery_output_file == null ) - { - File file = new File(DISCOVERY_OUTPUT); - if ((APPEND == true) && (file.exists())) - { - try - { - Runtime.getRuntime().exec("cp -v " + DISCOVERY_OUTPUT + " " + DISCOVERY_OUTPUT + ".backup"); - - // 1. Read the existing file and record the highest site we've seen. - Scanner scanner = new Scanner(file); - while(scanner.hasNext()) - { - String line = scanner.nextLine(); - String[] tokens = line.split(" +"); - String loc_string = tokens[0]; - if (loc_string.equals("loc")) { continue; } - highest_previously_done_loc = GenomeLocParser.parseGenomeLoc(loc_string); - } - - // 2. Open the output file for appending. - discovery_output_file = new PrintStream(new FileOutputStream(DISCOVERY_OUTPUT, true)); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - } - else - { - try - { - final String filename = DISCOVERY_OUTPUT; - discovery_output_file = new PrintStream(filename); - discovery_output_file.println(new MultiSampleCallResult().header()); - } - catch (Exception e) - { - throw new RuntimeException(e); - } - } - } - } - - ///////// - // Walker Interface Functions - public void initialize() - { - System.out.printf("\n\n\n\n"); - (new ClassicGenotypeLikelihoods()).TEST(); - - try - { - maybeInitializeDisoveryOutput(); - - INCLUDE_GENOTYPES = INDIVIDUAL_OUTPUT != null; - if ( INCLUDE_GENOTYPES ) { - individual_output_file = new PrintStream(new GZIPOutputStream(new FileOutputStream(INDIVIDUAL_OUTPUT))); - individual_output_file.println("loc ref sample_name genotype lodVsNextBest lodVsRef in_dbsnp AA AC AG AT CC CG CT GG GT TT"); - } - - INCLUDE_STATS = STATS_OUTPUT != null; - if ( INCLUDE_STATS ) { - stats_output_file = new PrintStream(STATS_OUTPUT); - stats_output_file.println(DepthStats.Header()); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - GenomeAnalysisEngine toolkit = this.getToolkit(); - this.header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - sample_names = new ArrayList(); - - HashSet unique_sample_names = new HashSet(); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - String platform = (String)(read_groups.get(i).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - //System.out.printf("SAMPLE: %s %s\n", sample_name, platform); - - if (unique_sample_names.contains(sample_name)) { continue; } - unique_sample_names.add(sample_name); - sample_names.add(sample_name); - - System.out.printf("UNIQUE_SAMPLE: %s %s\n", sample_name, platform); - } - - // Load the confusion matrix if it exists - if (CONFUSION_MATRIX_FILE != null) - { - this.confusion_matrix = new ConfusionMatrix(CONFUSION_MATRIX_FILE); - } - - } - - - Date start_time = null; - int n_sites_processed = 0; - - public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - if (start_time == null) { start_time = new Date(); } - - if (in_skip_mask(context.getLocation()) == true) { return null; } - - if ((n_sites_processed % 1000) == 0) - { - Date current_time = new Date(); - long elapsed = current_time.getTime() - start_time.getTime(); - out.printf("RUNTIME: %d sites processed in %f seconds; %f seconds per site.\n", - n_sites_processed, - (double)elapsed/1000.0, - ((double)elapsed/1000.0)/(double)n_sites_processed); - } - n_sites_processed += 1; - - context = filter_each_read(context); - - if (ref.getBaseAsChar() == 'N') { return null; } - if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) == -1) { return null; } - if (context.getReads().size() <= 0) { return null; } - if (context.getReads().size() >= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold - - this.ref = ref.getBaseAsChar(); - MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBaseAsChar(), context, sample_names); - if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBaseAsChar(), context)); - return result; - } - - public void onTraversalDone(String sum) - { - discovery_output_file.flush(); - discovery_output_file.close(); - - if ( INCLUDE_STATS ) { - stats_output_file.flush(); - stats_output_file.close(); - } - - out.println("MultiSampleCaller done."); - return; - } - - public String reduceInit() - { - return null; - } - - public String reduce(MultiSampleCallResult record, String sum) - { - if (record != null) - { - discovery_output_file.printf(record.toString() + "\n"); - } - return null; - } - - // END Walker Interface Functions - ///////// - - - ///////// - // Calling Functions - - char ref; - protected ConfusionMatrix confusion_matrix; - - ClassicGenotypeLikelihoods reallyMakeGenotypeLikelihood(AlignmentContext context) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - - // Handle single-base polymorphisms. - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { - //System.out.printf("DBG: %s\n", context.getLocation()); - - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if (CONFUSION_MATRIX_FILE == null) - { - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - else - { - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); - } - } - - return G; - } - - HashMap glCache = new HashMap(); - int cache_size = 0; - - ClassicGenotypeLikelihoods GenotypeOld(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { - //ReadBackedPileup pileup = new ReadBackedPileup(ref, context); - //String bases = pileup.getBases(); - - List reads = context.getReads(); - List offsets = context.getOffsets(); - ref = Character.toUpperCase(ref); - - if (reads.size() == 0) { - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - return G; - } - - // Handle single-base polymorphisms. - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { - //System.out.printf("DBG: %s\n", context.getLocation()); - - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if (CONFUSION_MATRIX_FILE == null) - { - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - else - { - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); - } - } - G.ApplyPrior(ref, allele_likelihoods); - - // Handle indels - if (CALL_INDELS) - { - String[] indels = BasicPileup.indelPileup(reads, offsets); - IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq); - if (indel_call.getType() != null) - { - G.addIndelLikelihood(indel_call); - } - else - { - G.addIndelLikelihood(null); - } - } - - return G; - } - - ClassicGenotypeLikelihoods Genotype(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { - return GenotypeCache(context, allele_likelihoods, indel_alt_freq ); - //return GenotypeOld(context, allele_likelihoods, indel_alt_freq ); - } - - - ClassicGenotypeLikelihoods GenotypeCache(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) - { - ref = Character.toUpperCase(ref); - - // Handle single-base polymorphisms. - ClassicGenotypeLikelihoods G = null; - if ( context.getReads().size() == 0 ) { - G = new ClassicGenotypeLikelihoods(); - return G; - } else { - if ( true && glCache.containsKey(context) ) { - ClassicGenotypeLikelihoods cached = glCache.get(context); - G = (ClassicGenotypeLikelihoods)cached.clone(); - } else { - G = reallyMakeGenotypeLikelihood(context); - if (cache_size < 5000) - { - //System.out.printf("cache add (%d)\n", cache_size); - glCache.put(context, G.clone()); - cache_size += context.getReads().size(); - } - else - { - //System.out.printf("cache skip (%d)\n", cache_size); - } - } - G.ApplyPrior(ref, allele_likelihoods); - } - - return G; - } - - // This version is a little faster. - double[] CountFreqs(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] allele_likelihoods = new double[4]; - - for (int x = 0; x < genotype_likelihoods.length; x++) - { - ClassicGenotypeLikelihoods G = genotype_likelihoods[x]; - - if (G.coverage == 0) { continue; } - - double[] personal_allele_likelihoods = new double[4]; - - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - double likelihood = Math.pow(10,G.likelihoods[k]); - personal_allele_likelihoods[i] += likelihood; - personal_allele_likelihoods[j] += likelihood; - k++; - } - } - double sum = 0; - for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; } - for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; } - for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; } - } - - double sum = 0; - for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; } - for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; } - - return allele_likelihoods; - } - - - double CountIndelFreq(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - HashMap indel_allele_likelihoods = new HashMap(); - - double pRef = 0; - double pAlt = 0; - - for (int j = 0; j < sample_names.size(); j++) - { - double personal_pRef = 0; - double personal_pAlt = 0; - - IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood(); - personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet()); - personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet()); - - personal_pRef = personal_pRef / (personal_pAlt + personal_pRef); - personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef); - - pRef += personal_pRef; - pAlt += personal_pAlt; - } - - pRef = pRef / (pRef + pAlt); - pAlt = pAlt / (pRef + pAlt); - - return pAlt; - } - - // Potential precision error here. - double Compute_pD(ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] sample_weights) - { - double pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD += Math.log10(sample_weights[i] * sum); - } - return pD; - } - - double Compute_pNull(AlignmentContext[] contexts, double[] sample_weights) - { - double[] allele_likelihoods = new double[4]; - for (int i = 0; i < 4; i++) { allele_likelihoods[i] = 1e-6/3.0; } - allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6); - } - return Compute_pD(G, sample_weights); - } - - double[] Compute_SampleWeights(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] pD = new double[sample_names.size()]; - double total_pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD[i] = sum; - total_pD += pD[i]; - } - - for (int i = 0; i < sample_names.size(); i++) - { - pD[i] /= total_pD; - } - - return pD; - } - - // Some globals to cache results. - EM_Result em_result; - double pD; - double pNull; - double lod; - double LOD(AlignmentContext[] contexts) - { - em_result = EM(contexts); - ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; - pD = Compute_pD(G, em_result.sample_weights); - pNull = Compute_pNull(contexts, em_result.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - double p0 = Math.log10(1 - pVar); - double pF; - - double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); - - if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } - else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } - - //System.out.printf("DBG %s %c %f %f %f %f (%.20f) %f ", contexts[0].getLocation(), ref, pD, pF, pNull, p0, Compute_alt_freq(ref, em_result.allele_likelihoods), 2.0 * em_result.EM_N); - //for (int i = 0; i < 4; i++) { System.out.printf("%f ", em_result.allele_likelihoods[i]); } - //System.out.printf("\n"); - - pD = pD + pF; - pNull = pNull + p0; - } - - lod = pD - pNull; - return lod; - } - - class EM_Result - { - String[] sample_names; - ClassicGenotypeLikelihoods[] genotype_likelihoods; - double[] allele_likelihoods; - double[] sample_weights; - int EM_N; - - public EM_Result(List sample_names, ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods, double[] sample_weights) - { - this.sample_names = new String[1]; - this.sample_names = sample_names.toArray(this.sample_names); - this.genotype_likelihoods = genotype_likelihoods; - this.allele_likelihoods = allele_likelihoods; - this.sample_weights = sample_weights; - - EM_N = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage > 0) { EM_N += 1; } - } - - } - - } - - final static double[] sample_weights = new double[1000]; - static { - for (int i = 0; i < 1000; i++) - { - //sample_weights[i] = 1.0/(double)i; - sample_weights[i] = 1.0; - } - } - - EM_Result EM(AlignmentContext[] contexts) - { - final boolean DEBUG_PRINT = false; - double[] allele_likelihoods = new double[4]; - - // These initial conditions should roughly replicate classic SSG. (at least on hets) - for (int i = 0; i < 4; i++) - { - if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999) - else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999) - } - double indel_alt_freq = 1e-4; - - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - //ClassicGenotypeLikelihoods[] Weighted_G = new ClassicGenotypeLikelihoods[sample_names.size()]; - - if ( DEBUG_PRINT ) System.out.printf("%n"); - - for (int i = 0; i < MAX_ITERATIONS; i++) - { - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq); - //if (WEIGHT_SAMPLES) { G[j].ApplyWeight(sample_weights[j]); } - } - - double[] old_allele_likelihoods = allele_likelihoods; - allele_likelihoods = CountFreqs(G); - double alDelta = 0.0; - for (int j = 0; j < 4; j++) { alDelta += Math.abs(old_allele_likelihoods[j] - allele_likelihoods[j]); } - - if ( DEBUG_PRINT ) - { - System.out.printf("%s AL %f %f %f %f => delta=%e < %e == %b%n", - contexts[0].getLocation(), - Math.log10(allele_likelihoods[0]), Math.log10(allele_likelihoods[1]), Math.log10(allele_likelihoods[2]), Math.log10(allele_likelihoods[3]), - alDelta, ALLELE_FREQ_TOLERANCE, alDelta < ALLELE_FREQ_TOLERANCE); - } - - //if ( alDelta < ALLELE_FREQ_TOLERANCE ) { - // System.out.printf("Converged after %d iterations%n", i); - // break; - //} - -// if (CALL_INDELS) -// { -// indel_alt_freq = CountIndelFreq(G); -// } - -// if (WEIGHT_SAMPLES) -// { -// sample_weights = Compute_SampleWeights(G); -// } - } - - return new EM_Result(sample_names, G, allele_likelihoods, sample_weights); - } - - // Hacky global variables for debugging. - double StrandScore(AlignmentContext context) - { - //AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - - AlignmentContext fw = filterAlignmentContext(context, "+"); - AlignmentContext bw = filterAlignmentContext(context, "-"); - AlignmentContext[] contexts_fw = filterAlignmentContext(fw, sample_names, 0); - AlignmentContext[] contexts_bw = filterAlignmentContext(bw, sample_names, 0); - - EM_Result em_fw = EM(contexts_fw); - EM_Result em_bw = EM(contexts_bw); - - double pNull_fw = Compute_pNull(contexts_fw, em_fw.sample_weights); - double pNull_bw = Compute_pNull(contexts_bw, em_bw.sample_weights); - - double pD_fw = Compute_pD(em_fw.genotype_likelihoods, em_fw.sample_weights); - double pD_bw = Compute_pD(em_bw.genotype_likelihoods, em_bw.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - pD_fw = pD_fw + Math.log10(THETA/(2.0*em_fw.EM_N * Compute_alt_freq(ref, em_fw.allele_likelihoods))); - pNull_fw = pNull_fw + Math.log10(1 - pVar); - - pD_bw = pD_bw + Math.log10(THETA/(2.0*em_bw.EM_N * Compute_alt_freq(ref, em_bw.allele_likelihoods))); - pNull_bw = pNull_bw + Math.log10(1 - pVar); - } - - double EM_alt_freq_fw = Compute_alt_freq(ref, em_fw.allele_likelihoods); - double EM_alt_freq_bw = Compute_alt_freq(ref, em_bw.allele_likelihoods); - - //double pNull = Compute_pNull(contexts); - //double lod = LOD(contexts); - - double lod_fw = (pD_fw + pNull_bw) - pNull; - double lod_bw = (pD_bw + pNull_fw) - pNull; - double strand_score = Math.max(lod_fw - lod, lod_bw - lod); - return strand_score; - } - -// ClassicGenotypeLikelihoods HardyWeinberg(double[] allele_likelihoods) -// { -// ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); -// int k = 0; -// for (int i = 0; i < 4; i++) -// { -// for (int j = i; j < 4; j++) -// { -// G.likelihoods[k] = allele_likelihoods[i] * allele_likelihoods[j]; -// k++; -// } -// } -// return G; -// } - - char PickAlt(char ref, double[] allele_likelihoods) - { - Integer[] perm = MathUtils.sortPermutation(allele_likelihoods); - if (perm[3] != BaseUtils.simpleBaseToBaseIndex(ref)) { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[3]); } - else { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[2]); } - } - - double Compute_discovery_lod(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double pBest = 0; - double pRef = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - pBest += genotype_likelihoods[i].BestPosterior(); - pRef += genotype_likelihoods[i].RefPosterior(ref); - } - return pBest - pRef; - } - - // this one is a bit of a lazy hack. - double Compute_alt_freq(char ref, double[] allele_likelihoods) - { - return allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(PickAlt(ref, allele_likelihoods))]; - } - - int Compute_n_ref(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_het(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) != ref)) { n += 1; } - if ((g.charAt(0) != ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_hom(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) != ref) && (g.charAt(1) != ref)) { n += 1; } - } - return n; - } - - // This should actually return a GLF Record - MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List sample_names) - { - String in_dbsnp; - if (tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME).size() > 0) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; } - - AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - glCache.clear(); // reset the cache - cache_size = 0; - - double lod = LOD(contexts); - - int n_ref = Compute_n_ref(ref, em_result.genotype_likelihoods); - int n_het = Compute_n_het(ref, em_result.genotype_likelihoods); - int n_hom = Compute_n_hom(ref, em_result.genotype_likelihoods); - - //double strand_score = lod > MIN_LOD_FOR_STRAND ? StrandScore(context) : 0.0; - double strand_score; - if (n_het+n_hom > 0) { strand_score = StrandScore(context); } - else { strand_score = 0; } - - //EM_Result em_result = EM(contexts); - //ClassicGenotypeLikelihoods population_genotype_likelihoods = HardyWeinberg(em_result.allele_likelihoods); - - //double pD = Compute_pD(em_result.genotype_likelihoods); - //double pNull = Compute_pNull(contexts); - - //double discovery_lod = Compute_discovery_lod(ref, em_result.genotype_likelihoods); - double alt_freq = Compute_alt_freq(ref, em_result.allele_likelihoods); - - char alt = 'N'; - //if (lod > 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); } - if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); } - - if ( INCLUDE_GENOTYPES ) { - for (int i = 0; i < em_result.genotype_likelihoods.length; i++) - { - individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); - individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), - em_result.genotype_likelihoods[i].LodVsNextBest(), - em_result.genotype_likelihoods[i].LodVsRef(ref), - in_dbsnp); - - //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBaseCountsString()); - assert(em_result.genotype_likelihoods[i] != null); - em_result.genotype_likelihoods[i].sort(); - assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); - - if ( INCLUDE_GENOTYPES ) { - for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) - { - individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); - } - individual_output_file.printf("\n"); - } - } - } - - return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq); - } - - // END Calling Functions - ///////// - - ///////// - // Utility Functions - - /// Filter a locus context by forward and backward - private AlignmentContext filterAlignmentContext(AlignmentContext context, String strand) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - - // Filter for strandedness - if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; } - if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; } - reads.add(read); - offsets.add(offset); - } - return new AlignmentContext(context.getLocation(), reads, offsets); - } - - // Filter a locus context by sample ID - protected AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample) - { - HashMap index = new HashMap(); - for (int i = 0; i < sample_names.size(); i++) - { - index.put(sample_names.get(i), i); - } - - AlignmentContext[] contexts = new AlignmentContext[sample_names.size()]; - ArrayList[] reads = new ArrayList[sample_names.size()]; - ArrayList[] offsets = new ArrayList[sample_names.size()]; - - for (int i = 0; i < sample_names.size(); i++) - { - reads[i] = new ArrayList(); - offsets[i] = new ArrayList(); - } - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String sample = header.getReadGroup(RG).getSample(); - if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - reads[index.get(sample)].add(read); - offsets[index.get(sample)].add(offset); - } - - if (downsample != 0) - { - for (int j = 0; j < reads.length; j++) - { - List perm = new ArrayList(); - for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } - perm = MathUtils.randomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads[j].get(perm.get(i))); - downsampled_offsets.add(offsets[j].get(perm.get(i))); - } - - reads[j] = downsampled_reads; - offsets[j] = downsampled_offsets; - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - else - { - for (int j = 0; j < reads.length; j++) - { - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - - return contexts; - } - - private AlignmentContext filter_each_read(AlignmentContext L) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < L.getReads().size(); i++) - { - SAMRecord read = L.getReads().get(i); - Integer offset = L.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(this.header != null); - //assert(this.header.getReadGroup(RG) != null); - if (this.header.getReadGroup(RG) == null) { continue; } - - // skip bogus data - if (read.getMappingQuality() == 0) { continue; } - - String sample = this.header.getReadGroup(RG).getSample(); - //if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - reads.add(read); - offsets.add(offset); - } - - AlignmentContext ans = new AlignmentContext(L.getLocation(), reads, offsets); - - return ans; - } - - // END Utility functions - ///////// - - - - -}