diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java new file mode 100755 index 000000000..88a4ef3aa --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/multisamplecaller/BasicPileup.java @@ -0,0 +1,186 @@ +package org.broadinstitute.sting.oneoffprojects.multisamplecaller; + +import org.broadinstitute.sting.utils.*; + +import net.sf.samtools.*; + +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 14, 2009 + * Time: 8:54:05 AM + * To change this template use File | Settings | File Templates. + */ +abstract public class BasicPileup { + public static final char DELETION_CHAR = 'D'; + + @Deprecated + abstract GenomeLoc getLocation(); + @Deprecated + abstract char getRef(); + @Deprecated + abstract int size(); + + + /** + * This is the right way to get bases + * + * @return + */ + @Deprecated + byte[] getBases() { return null; } + + /** + * This is the right way to get quals + * + * @return + */ + @Deprecated + byte[] getQuals() { return null; } + + /** + * This is a terrible way to get bases. Use getBases() or getBasesAsArrayList() + * + * @return + */ + @Deprecated + String getBasesAsString() { return null; } + + /** + * This is a terrible way to get quals. 
Use getQuals() or getQualsAsArrayList() + * + * @return + */ + @Deprecated + String getQualsAsString() { return null; } + + public String getPileupString() { + return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString()); + } + + // + // ArrayList methods + // + public static byte[] getBasesAsArray( List reads, List offsets ) { + byte array[] = new byte[reads.size()]; + int index = 0; + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + if ( offset == -1 ) { + array[index++] = ((byte)DELETION_CHAR); + } else { + array[index++] = read.getReadBases()[offset]; + } + } + return array; + } + + @Deprecated + public static ArrayList getBasesAsArrayList( List reads, List offsets ) { + ArrayList bases = new ArrayList(reads.size()); + for (byte value : getBasesAsArray(reads, offsets)) + bases.add(value); + return bases; + } + + @Deprecated + public static ArrayList getQualsAsArrayList( List reads, List offsets ) { + ArrayList quals = new ArrayList(reads.size()); + for (byte value : getQualsAsArray(reads, offsets)) + quals.add(value); + return quals; + } + + @Deprecated + public static byte[] getQualsAsArray( List reads, List offsets ) { + byte array[] = new byte[reads.size()]; + int index = 0; + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + // skip deletion sites + if ( offset == -1 ) { + array[index++] = ((byte)0); + } else { + array[index++] = read.getBaseQualities()[offset]; + } + } + return array; + } + + @Deprecated + public static ArrayList mappingQualPileup( List reads) { + ArrayList quals = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + byte qual = (byte)read.getMappingQuality(); + quals.add(qual); + } + return quals; + } + + @Deprecated // todo -- delete me + public static String[] indelPileup( List reads, List offsets ) + { + String[] indels = 
new String[reads.size()]; + + for (int i = 0; i < reads.size(); i++) + { + SAMRecord read = reads.get(i); + Cigar cigar = read.getCigar(); + int offset = offsets.get(i); + + String cigar_string = read.getCigarString(); + if (! (cigar_string.contains("I") || cigar_string.contains("D"))) { indels[i] = "null"; continue; } + + //System.out.printf("%s:%d %s %s %s ", read.getReferenceName(), read.getAlignmentStart(), read.getReadName(), read.getReadString(), cigar_string); + int k = 0; + for (int j = 0; j < cigar.numCigarElements(); j++) + { + CigarOperator operator = cigar.getCigarElement(j).getOperator(); + int length = cigar.getCigarElement(j).getLength(); + if (operator == CigarOperator.M) + { + k += length; + } + else if ((k == offset+1) && (operator == CigarOperator.I)) + { + // this insertion is associated with this offset (kinda ;) ). + indels[i] = read.getReadString().substring(k, k+length); + //System.out.printf("(I,%d,%d)", k, offset); + break; + } + else if ((k != offset+1) && (operator == CigarOperator.I)) + { + //System.out.printf("(i,%d,%d)", k, offset); + k += length; + } + else if ((k == offset) && (operator == CigarOperator.D)) + { + // this deletion is associated with this offset. + indels[i] = length + "D"; + //System.out.printf("(D,%d,%d)", k, offset); + break; + } + else if (k >= offset) + { + // no indel here. 
+ indels[i] = "null"; + //System.out.printf("(N,%d,%d)", k, offset); + break; + } + } + if (indels[i] == null) { indels[i] = "null"; } + //System.out.printf("\n"); + } + + return indels; + } +} + + diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java new file mode 100644 index 000000000..8209fc949 --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ClassicGenotypeLikelihoods.java @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.oneoffprojects.multisamplecaller; + +import org.broadinstitute.sting.utils.MathUtils; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; +import java.lang.Cloneable; + +public class ClassicGenotypeLikelihoods implements Cloneable { + // precalculate these for performance (pow/log10 is expensive!) + private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; + + static { + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0))); + //oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual)); + } + } + + private static double getOneMinusQual(final byte qual) { + return oneMinusData[qual]; + } + + private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; + + static { + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0); + //oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0); + } + } + + private static double getOneHalfMinusQual(final byte qual) { + return oneHalfMinusData[qual]; + } + + public double[] likelihoods; + public String[] genotypes; + public int coverage; + + // The genotype priors; + private double priorHomRef; + private double priorHet; + private double priorHomVar; + public String[] sorted_genotypes; + public double[] sorted_likelihoods; + double ref_likelihood = Double.NaN; + private IndelLikelihood indel_likelihood; + + // Store the 2nd-best base priors for on-genotype primary bases + //private HashMap onNextBestBasePriors = new HashMap(); + + // Store the 2nd-best base priors for off-genotype primary bases + //private HashMap offNextBestBasePriors = new HashMap(); + + private static double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; + private static double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; + + public 
ClassicGenotypeLikelihoods() { + initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); + } + + public ClassicGenotypeLikelihoods(boolean foo) { + } + + public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { + initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); + } + + public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); + } + + public ClassicGenotypeLikelihoods clone() { + ClassicGenotypeLikelihoods c = new ClassicGenotypeLikelihoods(false); + c.likelihoods = this.likelihoods.clone(); + c.genotypes = this.genotypes.clone(); + c.coverage = this.coverage; + + // The genotype priors; + c.priorHomRef = this.priorHomRef; + c.priorHet = this.priorHet; + c.priorHomVar = this.priorHomVar; + //public String[] sorted_genotypes; + //public double[] sorted_likelihoods; + //double ref_likelihood = Double.NaN; + //private IndelLikelihood indel_likelihood; + return c; + } + + private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + this.priorHomRef = priorHomRef; + this.priorHet = priorHet; + this.priorHomVar = priorHomVar; + + likelihoods = new double[10]; + genotypes = new String[10]; + coverage = 0; + + for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } + + genotypes[0] = "AA"; + genotypes[1] = "AC"; + genotypes[2] = "AG"; + genotypes[3] = "AT"; + genotypes[4] = "CC"; + genotypes[5] = "CG"; + genotypes[6] = "CT"; + genotypes[7] = "GG"; + genotypes[8] = "GT"; + genotypes[9] = "TT"; + + //for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + // onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); + // offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); + //} + } + + public double getHomRefPrior() { + return priorHomRef; + } + + public 
void setHomRefPrior(double priorHomRef) { + this.priorHomRef = priorHomRef; + } + + public double getHetPrior() { + return priorHet; + } + + public void setHetPrior(double priorHet) { + this.priorHet = priorHet; + } + + public double getHomVarPrior() { + return priorHomVar; + } + + public void setHomVarPrior(double priorHomVar) { + this.priorHomVar = priorHomVar; + } + +// public double[] getOnGenotypeSecondaryPriors() { +// double[] p2ndon = new double[10]; +// +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); +// } +// +// return p2ndon; +// } +// +// public void setOnGenotypeSecondaryPriors(double[] p2ndon) { +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); +// } +// } +// +// public double[] getOffGenotypeSecondaryPriors() { +// double[] p2ndoff = new double[10]; +// +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); +// } +// +// return p2ndoff; +// } +// +// public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); +// } +// } + + public void add(char ref, char read, byte qual) + { + if (qual <= 0) { qual = 1; } + + if (coverage == 0) + { + for (int i = 0; i < likelihoods.length; i++) + { + likelihoods[i] = 0; + } + } + double sum = 0.0; + for (int i = 0; i < genotypes.length; i++) + { + double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); + likelihoods[i] += likelihood; + } + coverage += 1; + } + + public void add(char ref, char read, byte qual, ConfusionMatrix matrix, String platform) + { + if (qual <= 0) { qual = 1; } + if (platform == null) { platform = "ILLUMINA"; } + if (read 
== 'N') { return; } + + if (coverage == 0) + { + for (int i = 0; i < likelihoods.length; i++) + { + likelihoods[i] = 0; + } + } + double sum = 0.0; + for (int i = 0; i < genotypes.length; i++) + { + char h1 = genotypes[i].charAt(0); + char h2 = genotypes[i].charAt(1); + + double p1 = matrix.lookup(platform, read, h1); + double p2 = matrix.lookup(platform, read, h2); + + double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual, p1, p2); + + //System.out.printf("DBG: %c %c %s %d %f %f %f\n", ref, read, genotypes[i], qual, p1, p2, likelihood); + + likelihoods[i] += likelihood; + } + coverage += 1; + } + + private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { + if (qual == 0) { + qual = 1; + } // zero quals are wrong. + + char h1 = genotype.charAt(0); + char h2 = genotype.charAt(1); + + double p_base; + + if ((h1 == h2) && (h1 == read)) { + // hom + p_base = getOneMinusQual(qual); + } else if ((h1 != h2) && ((h1 == read) || (h2 == read))) { + // het + p_base = getOneHalfMinusQual(qual); + } else { + // error + p_base = qual / -10.0; + } + + return p_base; + } + + public void TEST() + { + double p_A2A = 1.00; + double p_T2T = 1.00; + + double p_A2T = 0.75; + double p_T2A = 0.25; + + char ref = 'A'; + + System.out.printf("\tA\tT\n"); + System.out.printf("A\t%.02f\t%.02f\n", p_A2A, p_A2T); + System.out.printf("T\t%.02f\t%.02f\n", p_T2A, p_T2T); + System.out.printf("\n"); + + System.out.printf("P(A,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'A', "AA", (byte)20, p_A2A, p_A2A)); + System.out.printf("P(A,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'A', "AT", (byte)20, p_A2A, p_A2T)); + System.out.printf("P(A,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'A', "TT", (byte)20, p_A2T, p_A2T)); + + System.out.printf("P(T,Q20|AA) = %f\n", calculateAlleleLikelihood(ref, 'T', "AA", (byte)20, p_T2A, p_T2A)); + System.out.printf("P(T,Q20|AT) = %f\n", calculateAlleleLikelihood(ref, 'T', "AT", (byte)20, p_T2A, p_T2T)); + 
System.out.printf("P(T,Q20|TT) = %f\n", calculateAlleleLikelihood(ref, 'T', "TT", (byte)20, p_T2T, p_T2T)); + + //System.exit(0); + } + + private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual, double p1, double p2) { + if (qual == 0) { + qual = 1; + } // zero quals are wrong. + + char h1 = genotype.charAt(0); + char h2 = genotype.charAt(1); + + double perr = Math.pow(10.0,qual/-10.0); + + double p_base = 0; + + if (read == h1) + { + p_base += (1.0 - perr); + } + else + { + p_base += perr * p1; + } + + if (read == h2) + { + p_base += (1.0 - perr); + } + else + { + p_base += perr * p2; + } + + p_base = Math.log10(p_base/2.0); + + return p_base; + } + + public void sort() { + Integer[] permutation = MathUtils.sortPermutation(likelihoods); + + Integer[] reverse_permutation = new Integer[permutation.length]; + for (int i = 0; i < reverse_permutation.length; i++) { + reverse_permutation[i] = permutation[(permutation.length - 1) - i]; + } + + sorted_genotypes = MathUtils.permuteArray(genotypes, reverse_permutation); + sorted_likelihoods = MathUtils.permuteArray(likelihoods, reverse_permutation); + } + + public String toString(char ref) { + this.sort(); + double sum = 0; + String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); + for (int i = 0; i < sorted_genotypes.length; i++) { + if (i != 0) { + s = s + " "; + } + s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); + sum += Math.pow(10,sorted_likelihoods[i]); + } + s = s + String.format(" %f", sum); + return s; + } + + public void ApplyPrior(char ref, double[] allele_likelihoods) + { + int k = 0; + for (int i = 0; i < 4; i++) + { + for (int j = i; j < 4; j++) + { + if (i == j) + { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); + } + else + { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); + } + k++; 
+ } + } + this.sort(); + } + + public void ApplyPrior(char ref, char alt, double p_alt) { + for (int i = 0; i < genotypes.length; i++) { + if ((p_alt == -1) || (p_alt <= 1e-6)) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += Math.log10(priorHomRef); + } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { + // hom-nonref + likelihoods[i] += Math.log10(priorHomVar); + } else { + // het + likelihoods[i] += Math.log10(priorHet); + } + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } + } else { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); + } else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt)) { + // hom-nonref + likelihoods[i] += 2.0 * Math.log10(p_alt); + } else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) || + ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt))) { + // het + likelihoods[i] += Math.log10((1.0 - p_alt) * p_alt * 2.0); + } else { + // something else (noise!) 
+ likelihoods[i] += Math.log10(1e-5); + } + + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } + } + } + this.sort(); + } + + public void ApplyWeight(double weight) + { + double log10weight = Math.log10(weight); + for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += log10weight; } + this.sort(); + } + +// public void applySecondBaseDistributionPrior(String primaryBases, String secondaryBases) { +// for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) { +// char firstAllele = genotypes[genotypeIndex].charAt(0); +// char secondAllele = genotypes[genotypeIndex].charAt(1); +// +// int offIsGenotypic = 0; +// int offTotal = 0; +// +// int onIsGenotypic = 0; +// int onTotal = 0; +// +// for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) { +// char primaryBase = primaryBases.charAt(pileupIndex); +// +// if (secondaryBases != null) { +// char secondaryBase = secondaryBases.charAt(pileupIndex); +// +// if (primaryBase != firstAllele && primaryBase != secondAllele) { +// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { +// offIsGenotypic++; +// } +// offTotal++; +// } else { +// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { +// onIsGenotypic++; +// } +// onTotal++; +// } +// } +// } +// +// double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex])); +// double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex])); +// +// likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior); +// } +// this.sort(); +// } + + public double LodVsNextBest() { + this.sort(); + return sorted_likelihoods[0] - sorted_likelihoods[1]; + } + + public double LodVsRef(char ref) { + if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { + ref_likelihood = sorted_likelihoods[0]; + return (-1.0 * 
this.LodVsNextBest()); + } else { + for (int i = 0; i < genotypes.length; i++) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + ref_likelihood = likelihoods[i]; + } + } + } + return sorted_likelihoods[0] - ref_likelihood; + } + + public String BestGenotype() { + this.sort(); + return this.sorted_genotypes[0]; + } + + public double BestPosterior() { + this.sort(); + return this.sorted_likelihoods[0]; + } + + public double RefPosterior(char ref) + { + this.LodVsRef(ref); + return this.ref_likelihood; + } + + public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } + public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; } + +} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java new file mode 100644 index 000000000..8b51648e5 --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ConfusionMatrix.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.oneoffprojects.multisamplecaller; + +import java.lang.*; +import java.util.*; +import java.io.*; + +import org.broadinstitute.sting.utils.*; + +public class ConfusionMatrix +{ + private double[][] ILLUMINA; + private double[][] solid; + private double[][] LS454; + + public ConfusionMatrix(String file_name) + { + //System.out.println("DBG: ConfusionMatrix constructor! 
(" + file_name + ")"); + + ILLUMINA = new double[4][4]; + solid = new double[4][4]; + LS454 = new double[4][4]; + + try + { + Scanner sc = new Scanner(new File(file_name)); + while (sc.hasNext()) + { + String platform = sc.next(); + char read = sc.next().charAt(0); + char ref = sc.next().charAt(0); + double p = sc.nextDouble(); + + int read_i = BaseUtils.simpleBaseToBaseIndex(read); + int ref_i = BaseUtils.simpleBaseToBaseIndex(ref); + + if (platform.equals("ILLUMINA")) { ILLUMINA[read_i][ref_i] = p; } + if (platform.equals("solid")) { solid[read_i][ref_i] = p; } + if (platform.equals("LS454")) { LS454[read_i][ref_i] = p; } + + //System.out.println("DBG: " + key + " " + p); + } + } + catch (Exception e) + { + e.printStackTrace(); + System.exit(-1); + } + + } + + double lookup(String platform, char read, char truth) + { + int read_i = BaseUtils.simpleBaseToBaseIndex(read); + int truth_i = BaseUtils.simpleBaseToBaseIndex(truth); + + double d = 0; + + if (platform.equals("ILLUMINA")) { d = ILLUMINA[read_i][truth_i]; } + else if (platform.equals("solid")) { d = solid[read_i][truth_i]; } + else if (platform.equals("LS454")) { d = LS454[read_i][truth_i]; } + else { assert(false); } + + return d; + } +} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java new file mode 100755 index 000000000..71ebd9498 --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/multisamplecaller/IndelLikelihood.java @@ -0,0 +1,113 @@ +package org.broadinstitute.sting.oneoffprojects.multisamplecaller; + +import java.util.HashMap; + +public class IndelLikelihood { + private String type; + private String[] alleles; + private double p; + private double lod; + + private double pRef; + private double pHet; + private double pHom; + private String alt; + + public IndelLikelihood(String type, String[] alleles, double p, double lod) { + initialize(type, alleles, p, lod); + } 
+ + public IndelLikelihood(String[] indels, double indel_alt_freq) + { + HashMap indel_allele_counts = new HashMap(); + + for (int i = 0; i < indels.length; i++) { + if (! indel_allele_counts.containsKey(indels[i])) { + indel_allele_counts.put(indels[i], 1); + } else { + indel_allele_counts.put(indels[i], indel_allele_counts.get(indels[i])+1); + } + } + + Object[] keys = indel_allele_counts.keySet().toArray(); + String[] alleles = new String[keys.length]; + int[] counts = new int[keys.length]; + //double likelihoods[] = new double[keys.length]; + int null_count = 0; + String max_allele = null; + int max_count = -1; + + if ((keys.length > 0) && (! ((keys.length == 1) && (((String)keys[0]).equals("null"))))) { + for (int i = 0; i < keys.length; i++) { + Integer count = (Integer)indel_allele_counts.get(keys[i]); + alleles[i] = (String)keys[i]; + counts[i] = count; + if (alleles[i].equals("null")) { null_count = counts[i]; } + else if (counts[i] > max_count) { max_count = counts[i]; max_allele = alleles[i]; } + //System.out.printf("%s[%d] ", keys[i], count); + } + //System.out.printf("\n"); + + double eps = 1e-3; + pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq); + pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq); + pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq); + + double lodRef = pRef - Math.max(pHet, pHom); + double lodHet = pHet - pRef; + double lodHom = pHom - pRef; + + //System.out.printf("%s/%s %f %f\n", "null", "null", pRef, lodRef); + //System.out.printf("%s/%s %f %f\n", max_allele, "null", pHet, lodHet); + //System.out.printf("%s/%s %f %f\n", max_allele, max_allele, pHom, lodHom); + //System.out.printf("\n"); + + if (lodRef > 0) { + // reference call + String[] genotype = new String[2]; + genotype[0] = "null"; + genotype[1] = "null"; + + //return new IndelLikelihood("ref", 
genotype, pRef, lodRef); + initialize("ref", genotype, pRef, lodRef); + } else if (lodHet > lodHom) { + // het call + String[] genotype = new String[2]; + genotype[0] = "null"; + genotype[1] = max_allele; + + //return new IndelLikelihood("het", genotype, pHet, lodHet); + initialize("het", genotype, pHet, lodHet); + } else { + // hom call + String[] genotype = new String[2]; + genotype[0] = max_allele; + genotype[1] = max_allele; + + //return new IndelLikelihood("hom", genotype, pHom, lodHom); + initialize("hom", genotype, pHom, lodHom); + } + } + } + + private void initialize(String type, String[] alleles, double p, double lod) { + this.type = type; + this.alleles = alleles; + this.p = p; + this.lod = lod; + } + + public String getAlt() { return alt; } + public double pRef() { return pRef; } + public double pHet() { return pHet; } + public double pHom() { return pHom; } + + public String getType() { return type; } + public String[] getAlleles() { return alleles; } + public double getPosteriorProbability() { return p; } + public double getLOD() { return lod; } + + public String toString() { + return String.format("%s/%s %f %f", alleles[0], alleles[1], p, lod); + } +} diff --git a/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java new file mode 100644 index 000000000..fdc1a61cb --- /dev/null +++ b/archive/java/src/org/broadinstitute/sting/multisamplecaller/MultiSampleCaller.java @@ -0,0 +1,1093 @@ + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.multisamplecaller; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; +import java.util.zip.GZIPOutputStream; + +// Beta iterative multi-sample caller +// j.maguire 6-11-2009 + +public class MultiSampleCaller extends LocusWalker +{ + @Argument(required=false, shortName="fractional_counts", doc="should we use fractional counts?") public boolean FRACTIONAL_COUNTS = false; + @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; + @Argument(fullName="discovery_output", shortName="discovery_output", 
required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT = null; + @Argument(fullName="individual_output", shortName="individual_output", required=false, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT = null; + @Argument(fullName="stats_output", shortName="stats_output", required=false, doc="file to write stats to") public String STATS_OUTPUT = null; + @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; + @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; + @Argument(fullName="weight_samples", shortName="weight_samples", required=false, doc="re-weight samples during EM?") public boolean WEIGHT_SAMPLES = false; + + @Argument(fullName="theta", shortName="theta", required=false, doc="rate of sequence divergence") public double THETA = 1e-3; + @Argument(fullName="allele_frequency_prior", shortName="allele_frequency_prior", required=false, doc="use prior on allele frequencies? (P(f) = theta/(N*f)") public boolean ALLELE_FREQUENCY_PRIOR = false; + + @Argument(fullName="confusion_matrix_file", shortName="confusion_matrix_file", required=false, doc="file containing confusion matrix for all three technologies") public String CONFUSION_MATRIX_FILE = null; + + @Argument(fullName="ALLELE_FREQ_TOLERANCE", shortName="AFT", required=false, doc="") + public double ALLELE_FREQ_TOLERANCE = 1e-6; + + @Argument(fullName="append", shortName="append", required=false, doc="if the discovery file already exists, don't re-call sites that are done.") boolean APPEND = false; + + private static final double MIN_LOD_FOR_STRAND = 0.01; + + // Private state. 
    // ---------------------------------------------------------------
    // Per-run state shared across the walker callbacks.
    // ---------------------------------------------------------------
    protected List sample_names;                          // unique sample names derived from read groups (populated in initialize())
    protected SAMFileHeader header;                       // SAM header, used to resolve read groups -> sample/platform
    protected PrintStream individual_output_file = null;  // per-individual genotype calls (gzipped); only when INDIVIDUAL_OUTPUT is set
    protected PrintStream discovery_output_file = null;   // SNP discovery output (always opened)
    protected PrintStream stats_output_file = null;       // per-site depth stats; only when STATS_OUTPUT is set

    private boolean INCLUDE_STATS = false;                // true iff STATS_OUTPUT was supplied
    private boolean INCLUDE_GENOTYPES = false ;           // true iff INDIVIDUAL_OUTPUT was supplied

    /**
     * Result record for one site: the call (ref/alt), the EM state, LOD and
     * strand scores, and per-genotype-class counts.  Emitted one-per-line to
     * the discovery output via toString(); header() gives the column names.
     */
    class MultiSampleCallResult
    {
        GenomeLoc location;
        char ref;
        char alt;
        EM_Result em_result;
        double lod;
        double strand_score;
        double pD;
        double pNull;
        String in_dbsnp;     // "known" or "novel" (set from the dbSNP track by the caller)
        int n_ref;
        int n_het;
        int n_hom;
        int EM_N;            // number of samples with nonzero coverage used by EM
        double alt_freq;

        public MultiSampleCallResult(GenomeLoc location, char ref, char alt, EM_Result em_result, double lod, double strand_score, double pD, double pNull, String in_dbsnp, int n_ref, int n_het, int n_hom, int EM_N, double alt_freq)
        {
            this.location = location;
            this.ref = ref;
            this.alt = alt;
            this.em_result = em_result;
            this.lod = lod;
            this.strand_score = strand_score;
            this.pD = pD;
            this.pNull = pNull;
            this.in_dbsnp = in_dbsnp;
            this.n_ref = n_ref;
            this.n_het = n_het;
            this.n_hom = n_hom;
            this.EM_N = EM_N;
            this.alt_freq = alt_freq;
        }

        // this is just so I can do new MultiSampleCallResult().header().
        // "inner classes cannot have static declarations" :(
        public MultiSampleCallResult() { }

        /** Column header line matching the field order written by toString(). */
        public String header()
        {
            return new String("loc ref alt lod strand_score pD pNull in_dbsnp pA pC pG pT EM_alt_freq EM_N n_ref n_het n_hom");
        }

        /** One space-separated output row; NOTE(review): requires em_result != null (the no-arg instance may only call header()). */
        public String toString()
        {
            String s = "";
            s = s + String.format("%s %c %c %f %f %f %f %s ", location, ref, alt, lod, strand_score, pD, pNull, in_dbsnp);
            for (int i = 0; i < 4; i++) { s = s + String.format("%f ", em_result.allele_likelihoods[i]); }
            s = s + String.format("%f %d %d %d %d", alt_freq, em_result.EM_N, n_ref, n_het, n_hom);
            return s;
        }
    }

    /**
     * Static helpers for the optional per-site depth/mapping-quality stats
     * output (one Row() per site, Header() once).
     */
    public static class DepthStats
    {
        public static String Header()
        {
            return "loc ref depth A C G T a c g t mq_min mq_mean mq_median mq_max mq_sd";
        }

        /** Format one stats row: location, ref base, depth, per-base counts by strand, then mapping-quality summary. */
        public static String Row(char ref, AlignmentContext context)
        {
            String ans = "";
            List reads = context.getReads();
            List offsets = context.getOffsets();
            //Pileup pileup = new ReadBackedPileupOld(ref, context);

            ans += String.format("%s ", context.getLocation());
            ans += String.format("%c ", ref);
            ans += String.format("%d ", reads.size());
            // forward-strand base counts (columns A C G T)
            ans += String.format("%d ", countBase(context, 'A', "+"));
            ans += String.format("%d ", countBase(context, 'C', "+"));
            ans += String.format("%d ", countBase(context, 'G', "+"));
            ans += String.format("%d ", countBase(context, 'T', "+"));
            // reverse-strand base counts (columns a c g t)
            ans += String.format("%d ", countBase(context, 'A', "-"));
            ans += String.format("%d ", countBase(context, 'C', "-"));
            ans += String.format("%d ", countBase(context, 'G', "-"));
            ans += String.format("%d ", countBase(context, 'T', "-"));

            ans += String.format("%s ", Stats(BasicPileup.mappingQualPileup(reads)));

            return ans;
        }

        /**
         * Count reads whose base at the pileup offset equals 'base', restricted
         * to the given strand ("+" forward, "-" reverse; any other string counts
         * both strands via the final branch).
         */
        static int countBase(AlignmentContext context, char base, String strand)
        {
            int count = 0;
            List reads = context.getReads();
            List offsets = context.getOffsets();
            for (int i = 0; i < reads.size(); i++)
            {
                if (reads.get(i).getReadString().charAt(offsets.get(i)) == base)
                {
                    if (strand.equals("+") && (reads.get(i).getReadNegativeStrandFlag()==false)) { count += 1; }
                    else if (strand.equals("-") && (reads.get(i).getReadNegativeStrandFlag()==true)) { count += 1; }
                    else if (! (strand.equals("+") || strand.equals("-"))) { count += 1; }
                }
            }
            return count;
        }

        /**
         * Summary stats "min mean median max sd" over a list of integer quals.
         * NOTE(review): several defects here —
         *   - throws IndexOutOfBoundsException on an empty list (X.get(0));
         *   - mean = sum/count is integer division before widening to double;
         *   - the sd loop reuses 'sum' and 'count' WITHOUT resetting them, so the
         *     accumulator still contains the first-pass total and count is doubled,
         *     making sd mathematically wrong;
         *   - 'sum += (x-mean)*(x-mean)' truncates the double back to long.
         * Left as-is in this documentation pass.
         */
        public static String Stats(ArrayList X)
        {
            Collections.sort(X);

            long count = 0;
            long sum = 0;
            long min = X.get(0);
            long max = X.get(0);
            long median = X.get(0);
            for (int i = 0; i < X.size(); i++)
            {
                int x = X.get(i);
                if (x < min) { min = x; }
                if (x > max) { max = x; }
                sum += x;
                count += 1;
                if (i == X.size()/2) { median = x; }
            }

            double mean = sum/count;
            for (int i = 0; i < X.size(); i++)
            {
                int x = X.get(i);
                sum += (x-mean)*(x-mean);
                count += 1;
            }
            double sd = Math.sqrt(sum/count);

            return String.format("%d %f %d %d %f", min, mean, median, max, sd);
        }
    }

    // Highest site already present in an existing discovery file (APPEND mode);
    // sites at or below this location are skipped by map().
    GenomeLoc highest_previously_done_loc = null;

    /** True iff this locus was already called in a previous (appended-to) run. */
    private boolean in_skip_mask(GenomeLoc loc)
    {
        if (highest_previously_done_loc == null) { return false; }
        if (highest_previously_done_loc.compareTo(loc) < 0) { return false; }
        else { return true; }
    }

    /**
     * Lazily open the discovery output.  In APPEND mode with an existing file:
     * back the file up, scan it to find the last called locus (so map() can skip
     * already-done sites), then reopen for append.  Otherwise create a fresh
     * file and write the header line.
     * NOTE(review): method name has a typo ("Disovery") but is private, so safe;
     * Runtime.exec("cp ...") is non-portable and is not waited on, so the backup
     * may race the reopen; the Scanner is never closed (resource leak).
     */
    private void maybeInitializeDisoveryOutput()
    {
        if ( discovery_output_file == null )
        {
            File file = new File(DISCOVERY_OUTPUT);
            if ((APPEND == true) && (file.exists()))
            {
                try
                {
                    Runtime.getRuntime().exec("cp -v " + DISCOVERY_OUTPUT + " " + DISCOVERY_OUTPUT + ".backup");

                    // 1. Read the existing file and record the highest site we've seen.
                    Scanner scanner = new Scanner(file);
                    while(scanner.hasNext())
                    {
                        String line = scanner.nextLine();
                        String[] tokens = line.split(" +");
                        String loc_string = tokens[0];
                        if (loc_string.equals("loc")) { continue; }  // skip the header row
                        highest_previously_done_loc = GenomeLocParser.parseGenomeLoc(loc_string);
                    }

                    // 2. Open the output file for appending.
                    discovery_output_file = new PrintStream(new FileOutputStream(DISCOVERY_OUTPUT, true));
                }
                catch (Exception e)
                {
                    throw new RuntimeException(e);
                }
            }
            else
            {
                try
                {
                    final String filename = DISCOVERY_OUTPUT;
                    discovery_output_file = new PrintStream(filename);
                    discovery_output_file.println(new MultiSampleCallResult().header());
                }
                catch (Exception e)
                {
                    throw new RuntimeException(e);
                }
            }
        }
    }

    /////////
    // Walker Interface Functions

    /**
     * Walker setup: open output files, then derive the unique sample list from
     * the SAM header's read groups (optionally normalizing names through
     * SAMPLE_NAME_REGEX), and load the confusion matrix if configured.
     * NOTE(review): any setup exception is printed and the JVM is killed with
     * System.exit(-1) rather than being propagated.
     */
    public void initialize()
    {
        System.out.printf("\n\n\n\n");
        (new ClassicGenotypeLikelihoods()).TEST();  // self-test of the likelihood code; presumably prints diagnostics

        try
        {
            maybeInitializeDisoveryOutput();

            INCLUDE_GENOTYPES = INDIVIDUAL_OUTPUT != null;
            if ( INCLUDE_GENOTYPES ) {
                individual_output_file = new PrintStream(new GZIPOutputStream(new FileOutputStream(INDIVIDUAL_OUTPUT)));
                individual_output_file.println("loc ref sample_name genotype lodVsNextBest lodVsRef in_dbsnp AA AC AG AT CC CG CT GG GT TT");
            }

            INCLUDE_STATS = STATS_OUTPUT != null;
            if ( INCLUDE_STATS ) {
                stats_output_file = new PrintStream(STATS_OUTPUT);
                stats_output_file.println(DepthStats.Header());
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            System.exit(-1);
        }

        GenomeAnalysisEngine toolkit = this.getToolkit();
        this.header = toolkit.getSAMFileHeader();
        List read_groups = header.getReadGroups();

        sample_names = new ArrayList();

        HashSet unique_sample_names = new HashSet();

        for (int i = 0; i < read_groups.size(); i++)
        {
            String sample_name = read_groups.get(i).getSample();
            String platform = (String)(read_groups.get(i).getAttribute(SAMReadGroupRecord.PLATFORM_TAG));

            // optional sample-name normalization: keep capture group 1 of the regex
            if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); }

            //System.out.printf("SAMPLE: %s %s\n", sample_name, platform);

            if (unique_sample_names.contains(sample_name)) { continue; }
            unique_sample_names.add(sample_name);
            sample_names.add(sample_name);

            System.out.printf("UNIQUE_SAMPLE: %s %s\n", sample_name, platform);
        }

        // Load the confusion matrix if it exists
        if (CONFUSION_MATRIX_FILE != null)
        {
            this.confusion_matrix = new ConfusionMatrix(CONFUSION_MATRIX_FILE);
        }

    }


    // Progress-tracking state for map(); start_time is set on the first call.
    Date start_time = null;
    int n_sites_processed = 0;

    /**
     * Per-locus entry point: skip already-done/degenerate sites, filter the
     * pileup, run the multi-sample caller, and optionally emit depth stats.
     * Returns null for skipped sites (reduce() ignores nulls).
     * NOTE(review): the RUNTIME progress line fires on the FIRST call too
     * (n_sites_processed == 0), dividing by zero — harmless for doubles
     * (prints Infinity/NaN) but misleading.
     */
    public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context)
    {
        if (start_time == null) { start_time = new Date(); }

        if (in_skip_mask(context.getLocation()) == true) { return null; }

        if ((n_sites_processed % 1000) == 0)
        {
            Date current_time = new Date();
            long elapsed = current_time.getTime() - start_time.getTime();
            out.printf("RUNTIME: %d sites processed in %f seconds; %f seconds per site.\n",
                n_sites_processed,
                (double)elapsed/1000.0,
                ((double)elapsed/1000.0)/(double)n_sites_processed);
        }
        n_sites_processed += 1;

        context = filter_each_read(context);

        if (ref.getBaseAsChar() == 'N') { return null; }
        if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) == -1) { return null; }
        if (context.getReads().size() <= 0) { return null; }
        if (context.getReads().size() >= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold

        this.ref = ref.getBaseAsChar();  // stash ref in a field; Genotype/EM read it
        MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBaseAsChar(), context, sample_names);
        if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBaseAsChar(), context));
        return result;
    }

    /** Flush/close outputs at end of traversal. */
    public void onTraversalDone(String sum)
    {
        discovery_output_file.flush();
        discovery_output_file.close();

        if ( INCLUDE_STATS ) {
            stats_output_file.flush();
            stats_output_file.close();
        }

        out.println("MultiSampleCaller done.");
        return;
    }

    /** No accumulated reduce state; results are written as a side effect. */
    public String reduceInit()
    {
        return null;
    }

    /**
     * Write each non-null call to the discovery file.
     * NOTE(review): printf is called with the record text as the FORMAT string;
     * a '%' in any field would throw. println(record.toString()) would be safer.
     */
    public String reduce(MultiSampleCallResult record, String sum)
    {
        if (record != null)
        {
            discovery_output_file.printf(record.toString() + "\n");
        }
        return null;
    }

    // END Walker Interface Functions
    /////////
    /////////
    // Calling Functions

    // Current reference base for the site being processed (set in map()).
    // NOTE(review): shared mutable state — Genotype*/EM all read and normalize it.
    char ref;
    protected ConfusionMatrix confusion_matrix;  // optional per-platform error model (loaded in initialize())

    /**
     * Build raw (prior-free) genotype likelihoods for one sample's pileup by
     * adding each read base + quality, optionally routed through the
     * platform-specific confusion matrix.
     */
    ClassicGenotypeLikelihoods reallyMakeGenotypeLikelihood(AlignmentContext context) {
        List reads = context.getReads();
        List offsets = context.getOffsets();

        // Handle single-base polymorphisms.
        ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods();
        for ( int i = 0; i < reads.size(); i++ )
        {
            //System.out.printf("DBG: %s\n", context.getLocation());

            SAMRecord read = reads.get(i);
            int offset = offsets.get(i);
            if (CONFUSION_MATRIX_FILE == null)
            {
                G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
            }
            else
            {
                String RG = (String)(read.getAttribute("RG"));

                assert(header != null);
                assert(header.getReadGroup(RG) != null);

                String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG));

                G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform);
            }
        }

        return G;
    }

    // Per-site cache of raw likelihoods keyed by AlignmentContext, so the EM
    // iterations don't recompute them; cleared at each new site by
    // MultiSampleCall().  cache_size counts cached READS, capped below.
    HashMap glCache = new HashMap();
    int cache_size = 0;

    /**
     * Original (uncached) genotyper: raw likelihoods + allele-frequency prior,
     * plus indel likelihoods when CALL_INDELS is set.  Kept for reference; the
     * active path is GenotypeCache().
     */
    ClassicGenotypeLikelihoods GenotypeOld(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) {
        //ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
        //String bases = pileup.getBases();

        List reads = context.getReads();
        List offsets = context.getOffsets();
        ref = Character.toUpperCase(ref);

        if (reads.size() == 0) {
            ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods();
            return G;
        }

        // Handle single-base polymorphisms.
        ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods();
        for ( int i = 0; i < reads.size(); i++ )
        {
            //System.out.printf("DBG: %s\n", context.getLocation());

            SAMRecord read = reads.get(i);
            int offset = offsets.get(i);
            if (CONFUSION_MATRIX_FILE == null)
            {
                G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
            }
            else
            {
                String RG = (String)(read.getAttribute("RG"));

                assert(header != null);
                assert(header.getReadGroup(RG) != null);

                String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG));

                G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform);
            }
        }
        G.ApplyPrior(ref, allele_likelihoods);

        // Handle indels
        if (CALL_INDELS)
        {
            String[] indels = BasicPileup.indelPileup(reads, offsets);
            IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq);
            if (indel_call.getType() != null)
            {
                G.addIndelLikelihood(indel_call);
            }
            else
            {
                G.addIndelLikelihood(null);
            }
        }

        return G;
    }

    /** Genotype one sample; delegates to the cached implementation. */
    ClassicGenotypeLikelihoods Genotype(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) {
        return GenotypeCache(context, allele_likelihoods, indel_alt_freq );
        //return GenotypeOld(context, allele_likelihoods, indel_alt_freq );
    }


    /**
     * Cached genotyper: clone the raw likelihoods from glCache when present
     * (the prior mutates G, so we always clone before ApplyPrior), else compute
     * them and cache a clone while under the read-count cap.
     * NOTE(review): unlike GenotypeOld(), this path never computes indel
     * likelihoods, so with CALL_INDELS the indel branch downstream
     * (CountIndelFreq) would see nulls — confirm intended.  The 'true &&' in
     * the containsKey test is a leftover toggle.
     */
    ClassicGenotypeLikelihoods GenotypeCache(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq)
    {
        ref = Character.toUpperCase(ref);

        // Handle single-base polymorphisms.
        ClassicGenotypeLikelihoods G = null;
        if ( context.getReads().size() == 0 ) {
            G = new ClassicGenotypeLikelihoods();
            return G;
        } else {
            if ( true && glCache.containsKey(context) ) {
                ClassicGenotypeLikelihoods cached = glCache.get(context);
                G = (ClassicGenotypeLikelihoods)cached.clone();
            } else {
                G = reallyMakeGenotypeLikelihood(context);
                if (cache_size < 5000)
                {
                    //System.out.printf("cache add (%d)\n", cache_size);
                    glCache.put(context, G.clone());  // store a prior-free clone
                    cache_size += context.getReads().size();
                }
                else
                {
                    //System.out.printf("cache skip (%d)\n", cache_size);
                }
            }
            G.ApplyPrior(ref, allele_likelihoods);
        }

        return G;
    }

    /**
     * EM M-step: estimate population allele frequencies by summing each
     * covered sample's normalized per-allele likelihood mass (each genotype's
     * likelihood contributes to both of its alleles), then renormalizing.
     */
    // This version is a little faster.
    double[] CountFreqs(ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        double[] allele_likelihoods = new double[4];

        for (int x = 0; x < genotype_likelihoods.length; x++)
        {
            ClassicGenotypeLikelihoods G = genotype_likelihoods[x];

            if (G.coverage == 0) { continue; }  // uncovered samples carry no information

            double[] personal_allele_likelihoods = new double[4];

            // k walks the 10 diploid genotypes in (i<=j) order; likelihoods are log10.
            int k = 0;
            for (int i = 0; i < 4; i++)
            {
                for (int j = i; j < 4; j++)
                {
                    double likelihood = Math.pow(10,G.likelihoods[k]);
                    personal_allele_likelihoods[i] += likelihood;
                    personal_allele_likelihoods[j] += likelihood;
                    k++;
                }
            }
            double sum = 0;
            for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; }
            for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; }
            for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; }
        }

        double sum = 0;
        for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; }
        for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; }

        return allele_likelihoods;
    }


    /**
     * Estimate the indel alternate-allele frequency across samples.
     * NOTE(review): two normalization bugs left as-is — personal_pAlt is
     * normalized using the ALREADY-normalized personal_pRef (same for the
     * final pRef/pAlt pair), so the returned frequency is skewed; and the
     * local indel_allele_likelihoods map is never used.  Also NPEs if the
     * genotyper did not attach an IndelLikelihood (see GenotypeCache note).
     */
    double CountIndelFreq(ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        HashMap indel_allele_likelihoods = new HashMap();

        double pRef = 0;
        double pAlt = 0;

        for (int j = 0; j < sample_names.size(); j++)
        {
            double personal_pRef = 0;
            double personal_pAlt = 0;

            IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood();
            personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet());
            personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet());

            personal_pRef = personal_pRef / (personal_pAlt + personal_pRef);
            personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef);

            pRef += personal_pRef;
            pAlt += personal_pAlt;
        }

        pRef = pRef / (pRef + pAlt);
        pAlt = pAlt / (pRef + pAlt);

        return pAlt;
    }

    /**
     * log10 P(data | model): per sample, sum the 10 genotype likelihoods
     * (converted out of log space), weight, and accumulate in log10.
     */
    // Potential precision error here.
    double Compute_pD(ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] sample_weights)
    {
        double pD = 0;
        for (int i = 0; i < sample_names.size(); i++)
        {
            double sum = 0;
            for (int j = 0; j < 10; j++)
            {
                sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]);
            }
            pD += Math.log10(sample_weights[i] * sum);
        }
        return pD;
    }

    /**
     * log10 P(data | null model): regenotype everyone under a near-monomorphic
     * reference prior (ref allele ~1, others 1e-6/3) and score with Compute_pD.
     */
    double Compute_pNull(AlignmentContext[] contexts, double[] sample_weights)
    {
        double[] allele_likelihoods = new double[4];
        for (int i = 0; i < 4; i++) { allele_likelihoods[i] = 1e-6/3.0; }
        allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6;
        ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()];
        for (int j = 0; j < sample_names.size(); j++)
        {
            G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6);
        }
        return Compute_pD(G, sample_weights);
    }

    /**
     * Per-sample data-likelihood weights (each sample's total likelihood mass
     * normalized over all samples).  Currently unused — the WEIGHT_SAMPLES
     * hook in EM() is commented out.
     */
    double[] Compute_SampleWeights(ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        double[] pD = new double[sample_names.size()];
        double total_pD = 0;
        for (int i = 0; i < sample_names.size(); i++)
        {
            double sum = 0;
            for (int j = 0; j < 10; j++)
            {
                sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]);
            }
            pD[i] = sum;
            total_pD += pD[i];
        }

        for (int i = 0; i < sample_names.size(); i++)
        {
            pD[i] /= total_pD;
        }

        return pD;
    }

    // Some globals to cache results.
    // Results of the most recent LOD() call, stashed in fields so
    // MultiSampleCall()/StrandScore() can read them without re-running EM.
    // NOTE(review): hidden ordering dependency — these are only valid after
    // LOD() and are silently stale otherwise.
    EM_Result em_result;
    double pD;
    double pNull;
    double lod;

    /**
     * Site LOD = log10 P(data|variant model from EM) - log10 P(data|ref-only
     * model), optionally adjusted by the theta-based allele-frequency prior
     * (P(variant) ~ sum THETA/i, P(f) = THETA/(2N*f)).  Side effects: sets the
     * em_result/pD/pNull/lod fields above.
     */
    double LOD(AlignmentContext[] contexts)
    {
        em_result = EM(contexts);
        ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods;
        pD = Compute_pD(G, em_result.sample_weights);
        pNull = Compute_pNull(contexts, em_result.sample_weights);

        if (ALLELE_FREQUENCY_PRIOR)
        {
            // Apply p(f).
            double pVar = 0.0;
            for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; }

            double p0 = Math.log10(1 - pVar);
            double pF;

            double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods);

            // Below the minimum representable frequency (1 chromosome in 2N),
            // treat the site as non-variant for the prior.
            if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; }
            else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); }

            //System.out.printf("DBG %s %c %f %f %f %f (%.20f) %f ", contexts[0].getLocation(), ref, pD, pF, pNull, p0, Compute_alt_freq(ref, em_result.allele_likelihoods), 2.0 * em_result.EM_N);
            //for (int i = 0; i < 4; i++) { System.out.printf("%f ", em_result.allele_likelihoods[i]); }
            //System.out.printf("\n");

            pD = pD + pF;
            pNull = pNull + p0;
        }

        lod = pD - pNull;
        return lod;
    }

    /**
     * Immutable-ish bundle of one EM run: per-sample genotype likelihoods,
     * estimated allele frequencies, sample weights, and EM_N = the number of
     * samples with nonzero coverage.
     */
    class EM_Result
    {
        String[] sample_names;
        ClassicGenotypeLikelihoods[] genotype_likelihoods;
        double[] allele_likelihoods;
        double[] sample_weights;
        int EM_N;

        public EM_Result(List sample_names, ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods, double[] sample_weights)
        {
            // toArray(new String[1]) idiom: the passed array only supplies the type
            this.sample_names = new String[1];
            this.sample_names = sample_names.toArray(this.sample_names);
            this.genotype_likelihoods = genotype_likelihoods;
            this.allele_likelihoods = allele_likelihoods;
            this.sample_weights = sample_weights;

            EM_N = 0;
            for (int i = 0; i < genotype_likelihoods.length; i++)
            {
                if (genotype_likelihoods[i].coverage > 0) { EM_N += 1; }
            }

        }

    }

    // Uniform sample weights (sized for up to 1000 samples); the adaptive
    // weighting alternative is commented out here and in EM().
    final static double[] sample_weights = new double[1000];
    static {
        for (int i = 0; i < 1000; i++)
        {
            //sample_weights[i] = 1.0/(double)i;
            sample_weights[i] = 1.0;
        }
    }

    /**
     * EM over population allele frequencies: E-step genotypes every sample
     * under the current frequencies (Genotype), M-step re-estimates the
     * frequencies (CountFreqs).  Initial frequencies roughly replicate the
     * classic single-sample genotyper.
     * NOTE(review): the convergence break on alDelta is commented out, so this
     * always runs exactly MAX_ITERATIONS iterations (MAX_ITERATIONS is declared
     * elsewhere in this class, outside this chunk); the indel-frequency and
     * sample-weight updates are likewise disabled.
     */
    EM_Result EM(AlignmentContext[] contexts)
    {
        final boolean DEBUG_PRINT = false;
        double[] allele_likelihoods = new double[4];

        // These initial conditions should roughly replicate classic SSG. (at least on hets)
        for (int i = 0; i < 4; i++)
        {
            if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999)
            else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999)
        }
        double indel_alt_freq = 1e-4;

        ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()];
        //ClassicGenotypeLikelihoods[] Weighted_G = new ClassicGenotypeLikelihoods[sample_names.size()];

        if ( DEBUG_PRINT ) System.out.printf("%n");

        for (int i = 0; i < MAX_ITERATIONS; i++)
        {
            // E-step: regenotype every sample under the current frequency estimate.
            for (int j = 0; j < sample_names.size(); j++)
            {
                G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq);
                //if (WEIGHT_SAMPLES) { G[j].ApplyWeight(sample_weights[j]); }
            }

            // M-step: re-estimate frequencies and track the L1 change (diagnostic only).
            double[] old_allele_likelihoods = allele_likelihoods;
            allele_likelihoods = CountFreqs(G);
            double alDelta = 0.0;
            for (int j = 0; j < 4; j++) { alDelta += Math.abs(old_allele_likelihoods[j] - allele_likelihoods[j]); }

            if ( DEBUG_PRINT )
            {
                System.out.printf("%s AL %f %f %f %f => delta=%e < %e == %b%n",
                    contexts[0].getLocation(),
                    Math.log10(allele_likelihoods[0]), Math.log10(allele_likelihoods[1]), Math.log10(allele_likelihoods[2]), Math.log10(allele_likelihoods[3]),
                    alDelta, ALLELE_FREQ_TOLERANCE, alDelta < ALLELE_FREQ_TOLERANCE);
            }

            //if ( alDelta < ALLELE_FREQ_TOLERANCE ) {
            //    System.out.printf("Converged after %d iterations%n", i);
            //    break;
            //}

//            if (CALL_INDELS)
//            {
//                indel_alt_freq = CountIndelFreq(G);
//            }

//            if (WEIGHT_SAMPLES)
//            {
//                sample_weights = Compute_SampleWeights(G);
//            }
        }

        return new EM_Result(sample_names, G, allele_likelihoods, sample_weights);
    }

    // Hacky global variables for debugging.
    /**
     * Strand-bias score: rerun EM independently on the forward-only and
     * reverse-only pileups and compare each single-strand fit against the
     * combined call.  Higher scores mean the evidence is concentrated on one
     * strand (a common artifact signature).
     * NOTE(review): reads the pNull/lod/em_result FIELDS set by the prior
     * LOD() call in MultiSampleCall() — calling this without a preceding LOD()
     * on the same site gives stale values; in the prior branch, pVar is
     * computed from em_result (the combined run) rather than em_fw/em_bw.
     */
    double StrandScore(AlignmentContext context)
    {
        //AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0);

        AlignmentContext fw = filterAlignmentContext(context, "+");
        AlignmentContext bw = filterAlignmentContext(context, "-");
        AlignmentContext[] contexts_fw = filterAlignmentContext(fw, sample_names, 0);
        AlignmentContext[] contexts_bw = filterAlignmentContext(bw, sample_names, 0);

        EM_Result em_fw = EM(contexts_fw);
        EM_Result em_bw = EM(contexts_bw);

        double pNull_fw = Compute_pNull(contexts_fw, em_fw.sample_weights);
        double pNull_bw = Compute_pNull(contexts_bw, em_bw.sample_weights);

        double pD_fw = Compute_pD(em_fw.genotype_likelihoods, em_fw.sample_weights);
        double pD_bw = Compute_pD(em_bw.genotype_likelihoods, em_bw.sample_weights);

        if (ALLELE_FREQUENCY_PRIOR)
        {
            // Apply p(f).
            double pVar = 0.0;
            for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; }

            pD_fw = pD_fw + Math.log10(THETA/(2.0*em_fw.EM_N * Compute_alt_freq(ref, em_fw.allele_likelihoods)));
            pNull_fw = pNull_fw + Math.log10(1 - pVar);

            pD_bw = pD_bw + Math.log10(THETA/(2.0*em_bw.EM_N * Compute_alt_freq(ref, em_bw.allele_likelihoods)));
            pNull_bw = pNull_bw + Math.log10(1 - pVar);
        }

        double EM_alt_freq_fw = Compute_alt_freq(ref, em_fw.allele_likelihoods);
        double EM_alt_freq_bw = Compute_alt_freq(ref, em_bw.allele_likelihoods);

        //double pNull = Compute_pNull(contexts);
        //double lod = LOD(contexts);

        // "variant on one strand + null on the other" vs the combined null/lod fields.
        double lod_fw = (pD_fw + pNull_bw) - pNull;
        double lod_bw = (pD_bw + pNull_fw) - pNull;
        double strand_score = Math.max(lod_fw - lod, lod_bw - lod);
        return strand_score;
    }

//    ClassicGenotypeLikelihoods HardyWeinberg(double[] allele_likelihoods)
//    {
//        ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods();
//        int k = 0;
//        for (int i = 0; i < 4; i++)
//        {
//            for (int j = i; j < 4; j++)
//            {
//                G.likelihoods[k] = allele_likelihoods[i] * allele_likelihoods[j];
//                k++;
//            }
//        }
//        return G;
//    }

    /**
     * Pick the alternate allele: the most likely allele overall, unless that
     * is the reference, in which case the second most likely.
     */
    char PickAlt(char ref, double[] allele_likelihoods)
    {
        Integer[] perm = MathUtils.sortPermutation(allele_likelihoods);
        if (perm[3] != BaseUtils.simpleBaseToBaseIndex(ref)) { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[3]); }
        else { return BaseUtils.baseIndexToSimpleBaseAsChar(perm[2]); }
    }

    /** Sum over samples of (best-genotype posterior - hom-ref posterior); currently unused by the active call path. */
    double Compute_discovery_lod(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        double pBest = 0;
        double pRef = 0;
        for (int i = 0; i < genotype_likelihoods.length; i++)
        {
            pBest += genotype_likelihoods[i].BestPosterior();
            pRef += genotype_likelihoods[i].RefPosterior(ref);
        }
        return pBest - pRef;
    }

    // this one is a bit of a lazy hack.
    /** Estimated frequency of the picked alternate allele. */
    double Compute_alt_freq(char ref, double[] allele_likelihoods)
    {
        return allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(PickAlt(ref, allele_likelihoods))];
    }

    /** Count covered samples whose best genotype is hom-ref. */
    int Compute_n_ref(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        int n = 0;
        for (int i = 0; i < genotype_likelihoods.length; i++)
        {
            if (genotype_likelihoods[i].coverage == 0) { continue; }
            String g = genotype_likelihoods[i].BestGenotype();
            if ((g.charAt(0) == ref) && (g.charAt(1) == ref)) { n += 1; }
        }
        return n;
    }

    /** Count covered samples whose best genotype is het (exactly one ref allele). */
    int Compute_n_het(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        int n = 0;
        for (int i = 0; i < genotype_likelihoods.length; i++)
        {
            if (genotype_likelihoods[i].coverage == 0) { continue; }
            String g = genotype_likelihoods[i].BestGenotype();
            if ((g.charAt(0) == ref) && (g.charAt(1) != ref)) { n += 1; }
            if ((g.charAt(0) != ref) && (g.charAt(1) == ref)) { n += 1; }
        }
        return n;
    }

    /** Count covered samples whose best genotype is hom-nonref. */
    int Compute_n_hom(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods)
    {
        int n = 0;
        for (int i = 0; i < genotype_likelihoods.length; i++)
        {
            if (genotype_likelihoods[i].coverage == 0) { continue; }
            String g = genotype_likelihoods[i].BestGenotype();
            if ((g.charAt(0) != ref) && (g.charAt(1) != ref)) { n += 1; }
        }
        return n;
    }

    // This should actually return a GLF Record
    /**
     * Core per-site caller: split the pileup by sample, run EM/LOD, compute
     * genotype-class counts and (for variant sites) the strand score and alt
     * allele, optionally emit per-individual genotypes, and package everything
     * into a MultiSampleCallResult.  Relies on LOD() having populated the
     * em_result/pD/pNull fields.
     * NOTE(review): the per-individual print iterates sorted_likelihoods.length
     * but prints the UNSORTED likelihoods[j] — likely intentional (columns are
     * AA..TT in genotype order) but the sort()/assert dance is then redundant;
     * the inner 'if ( INCLUDE_GENOTYPES )' is redundant inside the outer check.
     */
    MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List sample_names)
    {
        String in_dbsnp;
        if (tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME).size() > 0) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; }

        AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0);
        glCache.clear(); // reset the cache
        cache_size = 0;

        double lod = LOD(contexts);

        int n_ref = Compute_n_ref(ref, em_result.genotype_likelihoods);
        int n_het = Compute_n_het(ref, em_result.genotype_likelihoods);
        int n_hom = Compute_n_hom(ref, em_result.genotype_likelihoods);

        //double strand_score = lod > MIN_LOD_FOR_STRAND ? StrandScore(context) : 0.0;
        // Only pay for the strand computation when someone was called non-ref.
        double strand_score;
        if (n_het+n_hom > 0) { strand_score = StrandScore(context); }
        else { strand_score = 0; }

        //EM_Result em_result = EM(contexts);
        //ClassicGenotypeLikelihoods population_genotype_likelihoods = HardyWeinberg(em_result.allele_likelihoods);

        //double pD = Compute_pD(em_result.genotype_likelihoods);
        //double pNull = Compute_pNull(contexts);

        //double discovery_lod = Compute_discovery_lod(ref, em_result.genotype_likelihoods);
        double alt_freq = Compute_alt_freq(ref, em_result.allele_likelihoods);

        char alt = 'N';
        //if (lod > 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); }
        if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); }

        if ( INCLUDE_GENOTYPES ) {
            for (int i = 0; i < em_result.genotype_likelihoods.length; i++)
            {
                individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i));
                individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(),
                    em_result.genotype_likelihoods[i].LodVsNextBest(),
                    em_result.genotype_likelihoods[i].LodVsRef(ref),
                    in_dbsnp);

                //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBaseCountsString());
                assert(em_result.genotype_likelihoods[i] != null);
                em_result.genotype_likelihoods[i].sort();
                assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null);

                if ( INCLUDE_GENOTYPES ) {
                    for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++)
                    {
                        individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]);
                    }
                    individual_output_file.printf("\n");
                }
            }
        }

        return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq);
    }

    // END Calling Functions
    /////////

    /////////
    // Utility Functions

    /// Filter a locus context by forward and backward
    /**
     * Keep only reads on the requested strand ("+" forward, "-" reverse;
     * a string containing both keeps everything).
     */
    private AlignmentContext filterAlignmentContext(AlignmentContext context, String strand)
    {
        ArrayList reads = new ArrayList();
        ArrayList offsets = new ArrayList();

        for (int i = 0; i < context.getReads().size(); i++)
        {
            SAMRecord read = context.getReads().get(i);
            Integer offset = context.getOffsets().get(i);

            // Filter for strandedness
            if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; }
            if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; }
            reads.add(read);
            offsets.add(offset);
        }
        return new AlignmentContext(context.getLocation(), reads, offsets);
    }

    // Filter a locus context by sample ID
    /**
     * Split one pileup into one AlignmentContext per sample (resolved via the
     * RG tag and the header's read groups, with optional SAMPLE_NAME_REGEX
     * normalization).  downsample > 0 additionally takes a random subset of
     * that many reads per sample.
     * NOTE(review): a read whose sample is not in sample_names would NPE on
     * index.get(sample) — filter_each_read() upstream drops reads with unknown
     * read groups, which presumably prevents this; confirm.
     */
    protected AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample)
    {
        HashMap index = new HashMap();
        for (int i = 0; i < sample_names.size(); i++)
        {
            index.put(sample_names.get(i), i);
        }

        AlignmentContext[] contexts = new AlignmentContext[sample_names.size()];
        ArrayList[] reads = new ArrayList[sample_names.size()];
        ArrayList[] offsets = new ArrayList[sample_names.size()];

        for (int i = 0; i < sample_names.size(); i++)
        {
            reads[i] = new ArrayList();
            offsets[i] = new ArrayList();
        }

        for (int i = 0; i < context.getReads().size(); i++)
        {
            SAMRecord read = context.getReads().get(i);
            Integer offset = context.getOffsets().get(i);
            String RG = (String)(read.getAttribute("RG"));

            assert(header != null);
            assert(header.getReadGroup(RG) != null);

            String sample = header.getReadGroup(RG).getSample();
            if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
            reads[index.get(sample)].add(read);
            offsets[index.get(sample)].add(offset);
        }

        if (downsample != 0)
        {
            for (int j = 0; j < reads.length; j++)
            {
                // Sample a random index subset, then materialize the chosen read/offset pairs.
                List perm = new ArrayList();
                for (int i = 0; i < reads[j].size(); i++) { perm.add(i); }
                perm = MathUtils.randomSubset(perm, downsample);

                ArrayList downsampled_reads = new ArrayList();
                ArrayList downsampled_offsets = new ArrayList();

                for (int i = 0; i < perm.size(); i++)
                {
                    downsampled_reads.add(reads[j].get(perm.get(i)));
                    downsampled_offsets.add(offsets[j].get(perm.get(i)));
                }

                reads[j] = downsampled_reads;
                offsets[j] = downsampled_offsets;
                contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]);
            }
        }
        else
        {
            for (int j = 0; j < reads.length; j++)
            {
                contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]);
            }
        }

        return contexts;
    }

    /**
     * Pre-filter the pileup: drop reads with an unknown read group (RG not in
     * the header) and reads with mapping quality 0.
     */
    private AlignmentContext filter_each_read(AlignmentContext L)
    {
        ArrayList reads = new ArrayList();
        ArrayList offsets = new ArrayList();

        for (int i = 0; i < L.getReads().size(); i++)
        {
            SAMRecord read = L.getReads().get(i);
            Integer offset = L.getOffsets().get(i);
            String RG = (String)(read.getAttribute("RG"));

            assert(this.header != null);
            //assert(this.header.getReadGroup(RG) != null);
            if (this.header.getReadGroup(RG) == null) { continue; }

            // skip bogus data
            if (read.getMappingQuality() == 0) { continue; }

            String sample = this.header.getReadGroup(RG).getSample();
            //if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); }

            reads.add(read);
            offsets.add(offset);
        }

        AlignmentContext ans = new AlignmentContext(L.getLocation(), reads, offsets);

        return ans;
    }

    // END Utility functions
    /////////




}