diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java index 6b7b3cc9d..80e451396 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java @@ -25,9 +25,9 @@ public class MultiSampleCaller extends LocusWalker sample_names; protected SAMFileHeader header; - protected PrintStream individual_output_file; - protected PrintStream discovery_output_file; - protected PrintStream stats_output_file; + protected PrintStream individual_output_file = null; + protected PrintStream discovery_output_file = null; + protected PrintStream stats_output_file = null; + + private boolean INCLUDE_STATS = false; + private boolean INCLUDE_GENOTYPES = false ; class MultiSampleCallResult { @@ -176,6 +179,19 @@ public class MultiSampleCaller extends LocusWalker= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold this.ref = ref.getBase(); MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBase(), context, sample_names); - stats_output_file.println(DepthStats.Row(ref.getBase(), context)); + if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBase(), context)); return result; } @@ -253,8 +274,10 @@ public class MultiSampleCaller extends LocusWalker reads = context.getReads(); List offsets = context.getOffsets(); ref = Character.toUpperCase(ref); - + + if (reads.size() == 0) { + ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); + return G; + } + + // Handle single-base polymorphisms. ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); for ( int i = 0; i < reads.size(); i++ ) @@ -318,7 +341,7 @@ public class MultiSampleCaller extends LocusWalker 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); } if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); } - for (int i = 0; i < em_result.genotype_likelihoods.length; i++) - { - individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); - individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), - em_result.genotype_likelihoods[i].LodVsNextBest(), - em_result.genotype_likelihoods[i].LodVsRef(ref), - in_dbsnp); - //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBasePileupAsCountsString()); - assert(em_result.genotype_likelihoods[i] != null); - em_result.genotype_likelihoods[i].sort(); - assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); - for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) - { - individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); - } - individual_output_file.printf("\n"); - } + for (int i = 0; i < em_result.genotype_likelihoods.length; i++) + { + if ( INCLUDE_GENOTYPES ) { + individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); + individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), + em_result.genotype_likelihoods[i].LodVsNextBest(), + em_result.genotype_likelihoods[i].LodVsRef(ref), + in_dbsnp); + } + + //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBasePileupAsCountsString()); + assert(em_result.genotype_likelihoods[i] != null); + em_result.genotype_likelihoods[i].sort(); + assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); + + if ( INCLUDE_GENOTYPES ) { + for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) + { + individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); + } + individual_output_file.printf("\n"); + } + } return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller2.java new file mode 100644 index 000000000..0046dbf0a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller2.java @@ -0,0 +1,989 @@ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.playground.utils.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; +import java.util.zip.*; +import java.io.*; + +// Beta iterative multi-sample caller +// j.maguire 6-11-2009 + +public class MultiSampleCaller2 extends LocusWalker +{ + @Argument(required=false, shortName="fractional_counts", doc="should we use fractional counts?") public boolean FRACTIONAL_COUNTS = false; + @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; + @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT = null; + @Argument(fullName="individual_output", shortName="individual_output", required=false, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT = null; + @Argument(fullName="stats_output", shortName="stats_output", required=false, doc="file to write stats to") public String STATS_OUTPUT = null; + @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; + @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; + @Argument(fullName="weight_samples", shortName="weight_samples", required=false, doc="rw-weight samples during EM?") public boolean WEIGHT_SAMPLES = false; + + @Argument(fullName="theta", shortName="theta", required=false, doc="rate of sequence divergence") public double THETA = 1e-3; + @Argument(fullName="allele_frequency_prior", shortName="allele_frequency_prior", required=false, doc="use prior on allele frequencies? (P(f) = theta/(N*f)") public boolean ALLELE_FREQUENCY_PRIOR = false; + + @Argument(fullName="confusion_matrix_file", shortName="confusion_matrix_file", required=false, doc="file containing confusion matrix for all three technologies") public String CONFUSION_MATRIX_FILE = null; + + @Argument(fullName="ALLELE_FREQ_TOLERANCE", shortName="AFT", required=false, doc="") + public double ALLELE_FREQ_TOLERANCE = 1e-3; + + private static final double MIN_LOD_FOR_STRAND = 0.01; + + // Private state. + protected List sample_names; + protected SAMFileHeader header; + protected PrintStream individual_output_file = null; + protected PrintStream discovery_output_file = null; + protected PrintStream stats_output_file = null; + + private boolean INCLUDE_STATS = false; + private boolean INCLUDE_GENOTYPES = false ; + + class MultiSampleCallResult + { + GenomeLoc location; + char ref; + char alt; + EM_Result em_result; + double lod; + double strand_score; + double pD; + double pNull; + String in_dbsnp; + int n_ref; + int n_het; + int n_hom; + int EM_N; + double alt_freq; + + public MultiSampleCallResult(GenomeLoc location, char ref, char alt, EM_Result em_result, double lod, double strand_score, double pD, double pNull, String in_dbsnp, int n_ref, int n_het, int n_hom, int EM_N, double alt_freq) + { + this.location = location; + this.ref = ref; + this.alt = alt; + this.em_result = em_result; + this.lod = lod; + this.strand_score = strand_score; + this.pD = pD; + this.pNull = pNull; + this.in_dbsnp = in_dbsnp; + this.n_ref = n_ref; + this.n_het = n_het; + this.n_hom = n_hom; + this.EM_N = EM_N; + this.alt_freq = alt_freq; + } + + public MultiSampleCallResult() { } // this is just so I can do new MultiSampleCallResult().header(). "inner classes cannot have static declarations" :( + + public String header() + { + return new String("loc ref alt lod strand_score pD pNull in_dbsnp pA pC pG pT EM_alt_freq EM_N n_ref n_het n_hom"); + } + + public String toString() + { + String s = ""; + s = s + String.format("%s %c %c %f %f %f %f %s ", location, ref, alt, lod, strand_score, pD, pNull, in_dbsnp); + for (int i = 0; i < 4; i++) { s = s + String.format("%f ", em_result.allele_likelihoods[i]); } + s = s + String.format("%f %d %d %d %d", alt_freq, em_result.EM_N, n_ref, n_het, n_hom); + return s; + } + } + + public static class DepthStats + { + public static String Header() + { + return "loc ref depth A C G T a c g t mq_min mq_mean mq_median mq_max mq_sd"; + } + + public static String Row(char ref, AlignmentContext context) + { + String ans = ""; + List reads = context.getReads(); + List offsets = context.getOffsets(); + Pileup pileup = new ReadBackedPileup(ref, context); + + ans += String.format("%s ", context.getLocation()); + ans += String.format("%c ", ref); + ans += String.format("%d ", reads.size()); + ans += String.format("%d ", countBase(context, 'A', "+")); + ans += String.format("%d ", countBase(context, 'C', "+")); + ans += String.format("%d ", countBase(context, 'G', "+")); + ans += String.format("%d ", countBase(context, 'T', "+")); + ans += String.format("%d ", countBase(context, 'A', "-")); + ans += String.format("%d ", countBase(context, 'C', "-")); + ans += String.format("%d ", countBase(context, 'G', "-")); + ans += String.format("%d ", countBase(context, 'T', "-")); + + ans += String.format("%s ", Stats(BasicPileup.mappingQualPileup(reads))); + + return ans; + } + + static int countBase(AlignmentContext context, char base, String strand) + { + int count = 0; + List reads = context.getReads(); + List offsets = context.getOffsets(); + for (int i = 0; i < reads.size(); i++) + { + if (reads.get(i).getReadString().charAt(offsets.get(i)) == base) + { + if (strand.equals("+") && (reads.get(i).getReadNegativeStrandFlag()==false)) { count += 1; } + else if (strand.equals("-") && (reads.get(i).getReadNegativeStrandFlag()==true)) { count += 1; } + else if (! (strand.equals("+") || strand.equals("-"))) { count += 1; } + } + } + return count; + } + + public static String Stats(ArrayList X) + { + Collections.sort(X); + + long count = 0; + long sum = 0; + long min = X.get(0); + long max = X.get(0); + long median = X.get(0); + for (int i = 0; i < X.size(); i++) + { + int x = X.get(i); + if (x < min) { min = x; } + if (x > max) { max = x; } + sum += x; + count += 1; + if (i == X.size()/2) { median = x; } + } + + double mean = sum/count; + for (int i = 0; i < X.size(); i++) + { + int x = X.get(i); + sum += (x-mean)*(x-mean); + count += 1; + } + double sd = Math.sqrt(sum/count); + + return String.format("%d %f %d %d %f", min, mean, median, max, sd); + } + } + + private void maybeInitializeDisoveryOutput() { + if ( discovery_output_file == null ) { + try { + //final String filename = String.format("%s_%s_%d.calls", DISCOVERY_OUTPUT, loc.getContig(), loc.getStart()); + final String filename = DISCOVERY_OUTPUT; + //System.out.printf("opening %s%n", filename); + discovery_output_file = new PrintStream(filename); + discovery_output_file.println(new MultiSampleCallResult().header()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + ///////// + // Walker Interface Functions + public void initialize() + { + System.out.printf("\n\n\n\n"); + (new ClassicGenotypeLikelihoods()).TEST(); + + try + { + maybeInitializeDisoveryOutput(); + + INCLUDE_GENOTYPES = INDIVIDUAL_OUTPUT != null; + if ( INCLUDE_GENOTYPES ) { + individual_output_file = new PrintStream(new GZIPOutputStream(new FileOutputStream(INDIVIDUAL_OUTPUT))); + individual_output_file.println("loc ref sample_name genotype lodVsNextBest lodVsRef in_dbsnp AA AC AG AT CC CG CT GG GT TT"); + } + + INCLUDE_STATS = STATS_OUTPUT != null; + if ( INCLUDE_STATS ) { + stats_output_file = new PrintStream(STATS_OUTPUT); + stats_output_file.println(DepthStats.Header()); + } + } + catch (Exception e) + { + e.printStackTrace(); + System.exit(-1); + } + + GenomeAnalysisEngine toolkit = this.getToolkit(); + this.header = toolkit.getSAMFileHeader(); + List read_groups = header.getReadGroups(); + + sample_names = new ArrayList(); + + HashSet unique_sample_names = new HashSet(); + + for (int i = 0; i < read_groups.size(); i++) + { + String sample_name = read_groups.get(i).getSample(); + String platform = (String)(read_groups.get(i).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); + + if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); } + + System.out.printf("SAMPLE: %s %s\n", sample_name, platform); + + if (unique_sample_names.contains(sample_name)) { continue; } + unique_sample_names.add(sample_name); + sample_names.add(sample_name); + + System.out.printf("UNIQUE_SAMPLE: %s %s\n", sample_name, platform); + } + + // Load the confusion matrix if it exists + if (CONFUSION_MATRIX_FILE != null) + { + this.confusion_matrix = new ConfusionMatrix(CONFUSION_MATRIX_FILE); + } + + } + + + public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) + { + context = filter_each_read(context); + + if (ref.getBase() == 'N') { return null; } + if (context.getReads().size() <= 0) { return null; } + if (context.getReads().size() >= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold + + this.ref = ref.getBase(); + MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBase(), context, sample_names); + if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBase(), context)); + return result; + } + + public void onTraversalDone(String sum) + { + discovery_output_file.flush(); + discovery_output_file.close(); + + if ( INCLUDE_STATS ) { + stats_output_file.flush(); + stats_output_file.close(); + } + + out.println("MultiSampleCaller done."); + return; + } + + public String reduceInit() + { + return null; + } + + public String reduce(MultiSampleCallResult record, String sum) + { + if (record != null) + { + discovery_output_file.printf(record.toString() + "\n"); + } + return null; + } + + // END Walker Interface Functions + ///////// + + + ///////// + // Calling Functions + + char ref; + protected ConfusionMatrix confusion_matrix; + + ClassicGenotypeLikelihoods reallyMakeGenotypeLikelihood(AlignmentContext context) { + List reads = context.getReads(); + List offsets = context.getOffsets(); + + // Handle single-base polymorphisms. + ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); + for ( int i = 0; i < reads.size(); i++ ) + { + //System.out.printf("DBG: %s\n", context.getLocation()); + + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + if (CONFUSION_MATRIX_FILE == null) + { + G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); + } + else + { + String RG = (String)(read.getAttribute("RG")); + + assert(header != null); + assert(header.getReadGroup(RG) != null); + + String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); + + G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); + } + } + + return G; + } + + HashMap glCache = new HashMap(); + + ClassicGenotypeLikelihoods GenotypeOld(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { + //ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + //String bases = pileup.getBases(); + + List reads = context.getReads(); + List offsets = context.getOffsets(); + ref = Character.toUpperCase(ref); + + if (reads.size() == 0) { + ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); + return G; + } + + // Handle single-base polymorphisms. + ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); + for ( int i = 0; i < reads.size(); i++ ) + { + //System.out.printf("DBG: %s\n", context.getLocation()); + + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + if (CONFUSION_MATRIX_FILE == null) + { + G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); + } + else + { + String RG = (String)(read.getAttribute("RG")); + + assert(header != null); + assert(header.getReadGroup(RG) != null); + + String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); + + G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); + } + } + G.ApplyPrior(ref, allele_likelihoods); + + // Handle indels + if (CALL_INDELS) + { + String[] indels = BasicPileup.indelPileup(reads, offsets); + IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq); + if (indel_call.getType() != null) + { + G.addIndelLikelihood(indel_call); + } + else + { + G.addIndelLikelihood(null); + } + } + + return G; + } + + ClassicGenotypeLikelihoods Genotype(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) { + return GenotypeCache(context, allele_likelihoods, indel_alt_freq ); + //return GenotypeOld(context, allele_likelihoods, indel_alt_freq ); + } + + + ClassicGenotypeLikelihoods GenotypeCache(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) + { + ref = Character.toUpperCase(ref); + + // Handle single-base polymorphisms. + ClassicGenotypeLikelihoods G = null; + if ( context.getReads().size() == 0 ) { + G = new ClassicGenotypeLikelihoods(); + return G; + } else { + if ( true && glCache.containsKey(context) ) { + ClassicGenotypeLikelihoods cached = glCache.get(context); + G = (ClassicGenotypeLikelihoods)cached.clone(); + } else { + G = reallyMakeGenotypeLikelihood(context); + glCache.put(context, G.clone()); + } + G.ApplyPrior(ref, allele_likelihoods); + } + + return G; + } + + // This version is a little faster. + double[] CountFreqs(ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + double[] allele_likelihoods = new double[4]; + double[] personal_allele_likelihoods = new double[4]; + + for (int x = 0; x < genotype_likelihoods.length; x++) + { + ClassicGenotypeLikelihoods G = genotype_likelihoods[x]; + + if (G.coverage == 0) { continue; } + + //double Z = 0; + int k = 0; + for (int i = 0; i < 4; i++) + { + personal_allele_likelihoods[i] = 0.0; + for (int j = i; j < 4; j++) + { + double likelihood = Math.pow(10,G.likelihoods[k]); + //Z += likelihood; + personal_allele_likelihoods[i] += likelihood; + personal_allele_likelihoods[j] += likelihood; + k++; + } + } + double sum = 0; + for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; } + for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; } + for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; } + } + + double sum = 0; + for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; } + for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; } + + return allele_likelihoods; + } + + + double CountIndelFreq(ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + HashMap indel_allele_likelihoods = new HashMap(); + + double pRef = 0; + double pAlt = 0; + + for (int j = 0; j < sample_names.size(); j++) + { + double personal_pRef = 0; + double personal_pAlt = 0; + + IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood(); + personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet()); + personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet()); + + personal_pRef = personal_pRef / (personal_pAlt + personal_pRef); + personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef); + + pRef += personal_pRef; + pAlt += personal_pAlt; + } + + pRef = pRef / (pRef + pAlt); + pAlt = pAlt / (pRef + pAlt); + + return pAlt; + } + + // Potential precision error here. + double Compute_pD(ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] sample_weights) + { + double pD = 0; + for (int i = 0; i < sample_names.size(); i++) + { + double sum = 0; + for (int j = 0; j < 10; j++) + { + sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); + } + pD += Math.log10(sample_weights[i] * sum); + } + return pD; + } + + double Compute_pNull(AlignmentContext[] contexts, double[] sample_weights) + { + double[] allele_likelihoods = new double[4]; + for (int i = 0; i < 4; i++) { allele_likelihoods[i] = 1e-6/3.0; } + allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; + ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; + for (int j = 0; j < sample_names.size(); j++) + { + G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6); + } + return Compute_pD(G, sample_weights); + } + + double[] Compute_SampleWeights(ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + double[] pD = new double[sample_names.size()]; + double total_pD = 0; + for (int i = 0; i < sample_names.size(); i++) + { + double sum = 0; + for (int j = 0; j < 10; j++) + { + sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); + } + pD[i] = sum; + total_pD += pD[i]; + } + + for (int i = 0; i < sample_names.size(); i++) + { + pD[i] /= total_pD; + } + + return pD; + } + + // Some globals to cache results. + EM_Result em_result; + double pD; + double pNull; + double lod; + double LOD(AlignmentContext[] contexts) + { + em_result = EM(contexts); + ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; + pD = Compute_pD(G, em_result.sample_weights); + pNull = Compute_pNull(contexts, em_result.sample_weights); + + if (ALLELE_FREQUENCY_PRIOR) + { + // Apply p(f). + double pVar = 0.0; + for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } + + double p0 = Math.log10(1 - pVar); + double pF; + + double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); + + if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } + else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } + + //System.out.printf("DBG %s %c %f %f %f %f (%.20f) %f ", contexts[0].getLocation(), ref, pD, pF, pNull, p0, Compute_alt_freq(ref, em_result.allele_likelihoods), 2.0 * em_result.EM_N); + //for (int i = 0; i < 4; i++) { System.out.printf("%f ", em_result.allele_likelihoods[i]); } + //System.out.printf("\n"); + + pD = pD + pF; + pNull = pNull + p0; + } + + lod = pD - pNull; + return lod; + } + + class EM_Result + { + String[] sample_names; + ClassicGenotypeLikelihoods[] genotype_likelihoods; + double[] allele_likelihoods; + double[] sample_weights; + int EM_N; + + public EM_Result(List sample_names, ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods, double[] sample_weights) + { + this.sample_names = new String[1]; + this.sample_names = sample_names.toArray(this.sample_names); + this.genotype_likelihoods = genotype_likelihoods; + this.allele_likelihoods = allele_likelihoods; + this.sample_weights = sample_weights; + + EM_N = 0; + for (int i = 0; i < genotype_likelihoods.length; i++) + { + if (genotype_likelihoods[i].coverage > 0) { EM_N += 1; } + } + + } + + } + + final static double[] sample_weights = new double[1000]; + static { + for (int i = 0; i < 1000; i++) + { + //sample_weights[i] = 1.0/(double)i; + sample_weights[i] = 1.0; + } + } + + EM_Result EM(AlignmentContext[] contexts) + { + final boolean DEBUG_PRINT = false; + double[] allele_likelihoods = new double[4]; + + // These initial conditions should roughly replicate classic SSG. (at least on hets) + for (int i = 0; i < 4; i++) + { + if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999) + else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999) + } + double indel_alt_freq = 1e-4; + + ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; + //ClassicGenotypeLikelihoods[] Weighted_G = new ClassicGenotypeLikelihoods[sample_names.size()]; + + if ( DEBUG_PRINT ) System.out.printf("%n"); + + for (int i = 0; i < MAX_ITERATIONS; i++) + { + for (int j = 0; j < sample_names.size(); j++) + { + G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq); + //if (WEIGHT_SAMPLES) { G[j].ApplyWeight(sample_weights[j]); } + } + + double[] old_allele_likelihoods = allele_likelihoods; + allele_likelihoods = CountFreqs(G); + double alDelta = 0.0; + for (int j = 0; j < 4; j++) { alDelta += Math.abs(old_allele_likelihoods[j] - allele_likelihoods[j]); } + if ( DEBUG_PRINT ) + System.out.printf("%s AL %f %f %f %f => delta=%e < %e == %b%n", + contexts[0].getLocation(), + allele_likelihoods[0], allele_likelihoods[1], allele_likelihoods[2], allele_likelihoods[3], + alDelta, ALLELE_FREQ_TOLERANCE, alDelta < ALLELE_FREQ_TOLERANCE); + + if ( alDelta < ALLELE_FREQ_TOLERANCE ) { + if ( DEBUG_PRINT ) System.out.printf("Aborting after %d iterations%n", i); + break; + } + +// if (CALL_INDELS) +// { +// indel_alt_freq = CountIndelFreq(G); +// } + +// if (WEIGHT_SAMPLES) +// { +// sample_weights = Compute_SampleWeights(G); +// } + } + + return new EM_Result(sample_names, G, allele_likelihoods, sample_weights); + } + + // Hacky global variables for debugging. + double StrandScore(AlignmentContext context) + { + //AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); + + AlignmentContext fw = filterAlignmentContext(context, "+"); + AlignmentContext bw = filterAlignmentContext(context, "-"); + AlignmentContext[] contexts_fw = filterAlignmentContext(fw, sample_names, 0); + AlignmentContext[] contexts_bw = filterAlignmentContext(bw, sample_names, 0); + + EM_Result em_fw = EM(contexts_fw); + EM_Result em_bw = EM(contexts_bw); + + double pNull_fw = Compute_pNull(contexts_fw, em_fw.sample_weights); + double pNull_bw = Compute_pNull(contexts_bw, em_bw.sample_weights); + + double pD_fw = Compute_pD(em_fw.genotype_likelihoods, em_fw.sample_weights); + double pD_bw = Compute_pD(em_bw.genotype_likelihoods, em_bw.sample_weights); + + if (ALLELE_FREQUENCY_PRIOR) + { + // Apply p(f). + double pVar = 0.0; + for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } + + pD_fw = pD_fw + Math.log10(THETA/(2.0*em_fw.EM_N * Compute_alt_freq(ref, em_fw.allele_likelihoods))); + pNull_fw = pNull_fw + Math.log10(1 - pVar); + + pD_bw = pD_bw + Math.log10(THETA/(2.0*em_bw.EM_N * Compute_alt_freq(ref, em_bw.allele_likelihoods))); + pNull_bw = pNull_bw + Math.log10(1 - pVar); + } + + double EM_alt_freq_fw = Compute_alt_freq(ref, em_fw.allele_likelihoods); + double EM_alt_freq_bw = Compute_alt_freq(ref, em_bw.allele_likelihoods); + + //double pNull = Compute_pNull(contexts); + //double lod = LOD(contexts); + + double lod_fw = (pD_fw + pNull_bw) - pNull; + double lod_bw = (pD_bw + pNull_fw) - pNull; + double strand_score = Math.max(lod_fw - lod, lod_bw - lod); + return strand_score; + } + +// ClassicGenotypeLikelihoods HardyWeinberg(double[] allele_likelihoods) +// { +// ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); +// int k = 0; +// for (int i = 0; i < 4; i++) +// { +// for (int j = i; j < 4; j++) +// { +// G.likelihoods[k] = allele_likelihoods[i] * allele_likelihoods[j]; +// k++; +// } +// } +// return G; +// } + + char PickAlt(char ref, double[] allele_likelihoods) + { + Integer[] perm = Utils.SortPermutation(allele_likelihoods); + if (perm[3] != BaseUtils.simpleBaseToBaseIndex(ref)) { return BaseUtils.baseIndexToSimpleBase(perm[3]); } + else { return BaseUtils.baseIndexToSimpleBase(perm[2]); } + } + + double Compute_discovery_lod(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + double pBest = 0; + double pRef = 0; + for (int i = 0; i < genotype_likelihoods.length; i++) + { + pBest += genotype_likelihoods[i].BestPosterior(); + pRef += genotype_likelihoods[i].RefPosterior(ref); + } + return pBest - pRef; + } + + // this one is a bit of a lazy hack. + double Compute_alt_freq(char ref, double[] allele_likelihoods) + { + return allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(PickAlt(ref, allele_likelihoods))]; + } + + int Compute_n_ref(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + int n = 0; + for (int i = 0; i < genotype_likelihoods.length; i++) + { + if (genotype_likelihoods[i].coverage == 0) { continue; } + String g = genotype_likelihoods[i].BestGenotype(); + if ((g.charAt(0) == ref) && (g.charAt(1) == ref)) { n += 1; } + } + return n; + } + + int Compute_n_het(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + int n = 0; + for (int i = 0; i < genotype_likelihoods.length; i++) + { + if (genotype_likelihoods[i].coverage == 0) { continue; } + String g = genotype_likelihoods[i].BestGenotype(); + if ((g.charAt(0) == ref) && (g.charAt(1) != ref)) { n += 1; } + if ((g.charAt(0) != ref) && (g.charAt(1) == ref)) { n += 1; } + } + return n; + } + + int Compute_n_hom(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) + { + int n = 0; + for (int i = 0; i < genotype_likelihoods.length; i++) + { + if (genotype_likelihoods[i].coverage == 0) { continue; } + String g = genotype_likelihoods[i].BestGenotype(); + if ((g.charAt(0) != ref) && (g.charAt(1) != ref)) { n += 1; } + } + return n; + } + + // This should actually return a GLF Record + MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List sample_names) + { + String in_dbsnp; + if (tracker.lookup("DBSNP", null) != null) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; } + + AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); + glCache.clear(); // reset the contexts + + double lod = LOD(contexts); + double strand_score = lod > MIN_LOD_FOR_STRAND ? StrandScore(context) : 0.0; + //EM_Result em_result = EM(contexts); + //ClassicGenotypeLikelihoods population_genotype_likelihoods = HardyWeinberg(em_result.allele_likelihoods); + + //double pD = Compute_pD(em_result.genotype_likelihoods); + //double pNull = Compute_pNull(contexts); + + //double discovery_lod = Compute_discovery_lod(ref, em_result.genotype_likelihoods); + double alt_freq = Compute_alt_freq(ref, em_result.allele_likelihoods); + + int n_ref = Compute_n_ref(ref, em_result.genotype_likelihoods); + int n_het = Compute_n_het(ref, em_result.genotype_likelihoods); + int n_hom = Compute_n_hom(ref, em_result.genotype_likelihoods); + + char alt = 'N'; + //if (lod > 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); } + if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); } + + if ( INCLUDE_GENOTYPES ) { + for (int i = 0; i < em_result.genotype_likelihoods.length; i++) + { + individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); + individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), + em_result.genotype_likelihoods[i].LodVsNextBest(), + em_result.genotype_likelihoods[i].LodVsRef(ref), + in_dbsnp); + + //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBasePileupAsCountsString()); + assert(em_result.genotype_likelihoods[i] != null); + em_result.genotype_likelihoods[i].sort(); + assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); + + if ( INCLUDE_GENOTYPES ) { + for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) + { + individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); + } + individual_output_file.printf("\n"); + } + } + } + + return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq); + } + + // END Calling Functions + ///////// + + ///////// + // Utility Functions + + /// Filter a locus context by forward and backward + private AlignmentContext filterAlignmentContext(AlignmentContext context, String strand) + { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + for (int i = 0; i < context.getReads().size(); i++) + { + SAMRecord read = context.getReads().get(i); + Integer offset = context.getOffsets().get(i); + + // Filter for strandedness + if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; } + if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; } + reads.add(read); + offsets.add(offset); + } + return new AlignmentContext(context.getLocation(), reads, offsets); + } + + // Filter a locus context by sample ID + protected AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample) + { + HashMap index = new HashMap(); + for (int i = 0; i < sample_names.size(); i++) + { + index.put(sample_names.get(i), i); + } + + AlignmentContext[] contexts = new AlignmentContext[sample_names.size()]; + ArrayList[] reads = new ArrayList[sample_names.size()]; + ArrayList[] offsets = new ArrayList[sample_names.size()]; + + for (int i = 0; i < sample_names.size(); i++) + { + reads[i] = new ArrayList(); + offsets[i] = new ArrayList(); + } + + for (int i = 0; i < context.getReads().size(); i++) + { + SAMRecord read = context.getReads().get(i); + Integer offset = context.getOffsets().get(i); + String RG = (String)(read.getAttribute("RG")); + + assert(header != null); + assert(header.getReadGroup(RG) != null); + + String sample = header.getReadGroup(RG).getSample(); + if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } + reads[index.get(sample)].add(read); + offsets[index.get(sample)].add(offset); + } + + if (downsample != 0) + { + for (int j = 0; j < reads.length; j++) + { + List perm = new ArrayList(); + for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } + perm = Utils.RandomSubset(perm, downsample); + + ArrayList downsampled_reads = new ArrayList(); + ArrayList downsampled_offsets = new ArrayList(); + + for (int i = 0; i < perm.size(); i++) + { + downsampled_reads.add(reads[j].get(perm.get(i))); + downsampled_offsets.add(offsets[j].get(perm.get(i))); + } + + reads[j] = downsampled_reads; + offsets[j] = downsampled_offsets; + contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); + } + } + else + { + for (int j = 0; j < reads.length; j++) + { + contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); + } + } + + return contexts; + } + + private AlignmentContext filter_each_read(AlignmentContext L) + { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + for (int i = 0; i < L.getReads().size(); i++) + { + SAMRecord read = L.getReads().get(i); + Integer offset = L.getOffsets().get(i); + String RG = (String)(read.getAttribute("RG")); + + assert(this.header != null); + //assert(this.header.getReadGroup(RG) != null); + if (this.header.getReadGroup(RG) == null) { continue; } + + // skip bogus data + if (read.getMappingQuality() == 0) { continue; } + + String sample = this.header.getReadGroup(RG).getSample(); + //if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } + + reads.add(read); + offsets.add(offset); + } + + AlignmentContext ans = new AlignmentContext(L.getLocation(), reads, offsets); + + return ans; + } + + // END Utility functions + ///////// + + + + +} diff --git a/java/src/org/broadinstitute/sting/playground/utils/ClassicGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/ClassicGenotypeLikelihoods.java index 382e894f9..78dadd04f 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/ClassicGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/ClassicGenotypeLikelihoods.java @@ -7,8 +7,9 @@ import org.broadinstitute.sting.utils.QualityUtils; import static java.lang.Math.log10; import static java.lang.Math.pow; import java.util.HashMap; +import java.lang.Cloneable; -public class ClassicGenotypeLikelihoods { +public class ClassicGenotypeLikelihoods implements Cloneable { // precalculate these for performance (pow/log10 is expensive!) private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; @@ -44,24 +45,28 @@ public class ClassicGenotypeLikelihoods { private double priorHomRef; private double priorHet; private double priorHomVar; + public String[] sorted_genotypes; + public double[] sorted_likelihoods; + double ref_likelihood = Double.NaN; + private IndelLikelihood indel_likelihood; // Store the 2nd-best base priors for on-genotype primary bases - private HashMap onNextBestBasePriors = new HashMap(); + //private HashMap onNextBestBasePriors = new HashMap(); // Store the 2nd-best base priors for off-genotype primary bases - private HashMap offNextBestBasePriors = new HashMap(); + //private HashMap offNextBestBasePriors = new HashMap(); + + private static double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; + private static double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; public ClassicGenotypeLikelihoods() { - double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; - double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; - initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); } - public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { - double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; - double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; + public ClassicGenotypeLikelihoods(boolean foo) { + } + public ClassicGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); } @@ -69,6 +74,23 @@ public class ClassicGenotypeLikelihoods { initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); } + public ClassicGenotypeLikelihoods clone() { + ClassicGenotypeLikelihoods c = new ClassicGenotypeLikelihoods(false); + c.likelihoods = this.likelihoods.clone(); + c.genotypes = this.genotypes.clone(); + c.coverage = this.coverage; + + // The genotype priors; + c.priorHomRef = this.priorHomRef; + c.priorHet = this.priorHet; + c.priorHomVar = this.priorHomVar; + //public String[] sorted_genotypes; + //public double[] sorted_likelihoods; + //double ref_likelihood = Double.NaN; + //private IndelLikelihood indel_likelihood; + return c; + } + private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { this.priorHomRef = priorHomRef; this.priorHet = priorHet; @@ -91,10 +113,10 @@ public class ClassicGenotypeLikelihoods { genotypes[8] = "GT"; genotypes[9] = "TT"; - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); - offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); - } + //for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + // onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); + // offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); + //} } public double getHomRefPrior() { @@ -121,37 +143,37 @@ public class ClassicGenotypeLikelihoods { this.priorHomVar = priorHomVar; } - public double[] getOnGenotypeSecondaryPriors() { - double[] p2ndon = new double[10]; - - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); - } - - return p2ndon; - } - - public void setOnGenotypeSecondaryPriors(double[] p2ndon) { - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); - } - } - - public double[] getOffGenotypeSecondaryPriors() { - double[] p2ndoff = new double[10]; - - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); - } - - return p2ndoff; - } - - public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { - offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); - } - } +// public double[] getOnGenotypeSecondaryPriors() { +// double[] p2ndon = new double[10]; +// +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); +// } +// +// return p2ndon; +// } +// +// public void setOnGenotypeSecondaryPriors(double[] p2ndon) { +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); +// } +// } +// +// public double[] getOffGenotypeSecondaryPriors() { +// double[] p2ndoff = new double[10]; +// +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); +// } +// +// return p2ndoff; +// } +// +// public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { +// for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { +// offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); +// } +// } public void add(char ref, char read, byte qual) { @@ -289,9 +311,6 @@ public class ClassicGenotypeLikelihoods { return p_base; } - public String[] sorted_genotypes; - public double[] sorted_likelihoods; - public void sort() { Integer[] permutation = Utils.SortPermutation(likelihoods); @@ -382,56 +401,55 @@ public class ClassicGenotypeLikelihoods { public void ApplyWeight(double weight) { - for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += Math.log10(weight); } + double log10weight = Math.log10(weight); + for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += log10weight; } this.sort(); } - public void applySecondBaseDistributionPrior(String primaryBases, String secondaryBases) { - for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) { - char firstAllele = genotypes[genotypeIndex].charAt(0); - char secondAllele = genotypes[genotypeIndex].charAt(1); - - int offIsGenotypic = 0; - int offTotal = 0; - - int onIsGenotypic = 0; - int onTotal = 0; - - for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) { - char primaryBase = primaryBases.charAt(pileupIndex); - - if (secondaryBases != null) { - char secondaryBase = secondaryBases.charAt(pileupIndex); - - if (primaryBase != firstAllele && primaryBase != secondAllele) { - if (secondaryBase == firstAllele || secondaryBase == secondAllele) { - offIsGenotypic++; - } - offTotal++; - } else { - if (secondaryBase == firstAllele || secondaryBase == secondAllele) { - onIsGenotypic++; - } - onTotal++; - } - } - } - - double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex])); - double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex])); - - likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior); - } - this.sort(); - } +// public void applySecondBaseDistributionPrior(String primaryBases, String secondaryBases) { +// for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) { +// char firstAllele = genotypes[genotypeIndex].charAt(0); +// char secondAllele = genotypes[genotypeIndex].charAt(1); +// +// int offIsGenotypic = 0; +// int offTotal = 0; +// +// int onIsGenotypic = 0; +// int onTotal = 0; +// +// for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) { +// char primaryBase = primaryBases.charAt(pileupIndex); +// +// if (secondaryBases != null) { +// char secondaryBase = secondaryBases.charAt(pileupIndex); +// +// if (primaryBase != firstAllele && primaryBase != secondAllele) { +// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { +// offIsGenotypic++; +// } +// offTotal++; +// } else { +// if (secondaryBase == firstAllele || secondaryBase == secondAllele) { +// onIsGenotypic++; +// } +// onTotal++; +// } +// } +// } +// +// double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex])); +// double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex])); +// +// likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior); +// } +// this.sort(); +// } public double LodVsNextBest() { this.sort(); return sorted_likelihoods[0] - sorted_likelihoods[1]; } - double ref_likelihood = Double.NaN; - public double LodVsRef(char ref) { if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { ref_likelihood = sorted_likelihoods[0]; @@ -462,7 +480,6 @@ public class ClassicGenotypeLikelihoods { return this.ref_likelihood; } - private IndelLikelihood indel_likelihood; public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; }