From 05923f7fba4949533da50e1fcd9719bbd184cd03 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 1 Dec 2009 21:19:02 +0000 Subject: [PATCH] Started transition to oneoffprojects. Moved/killed a few other walkers (with permission). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2204 348d0f76-0448-11de-a6fe-93d51630548a --- .../multisamplecaller}/ListSampleIds.java | 0 .../PrintHapmapGenotypes.java | 0 .../walkers/FlagStatWalker.java} | 10 +- ...seTransitionTableCalculatorJavaWalker.java | 2 +- .../walkers/HapmapPoolAllelicInfoWalker.java | 2 +- .../walkers/IOCrusherWalker.java | 2 +- .../walkers/MismatchCounterWalker.java | 0 .../walkers/MismatchHistoWalker.java | 0 .../gatk/walkers/MultiSampleCaller.java | 945 ------------------ 9 files changed, 6 insertions(+), 955 deletions(-) rename {java/src/org/broadinstitute/sting/playground/gatk/walkers => archive/java/src/org/broadinstitute/sting/multisamplecaller}/ListSampleIds.java (100%) rename {java/src/org/broadinstitute/sting/playground/gatk/walkers => archive/java/src/org/broadinstitute/sting/multisamplecaller}/PrintHapmapGenotypes.java (100%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/FlagStatReadWalker.java => gatk/walkers/FlagStatWalker.java} (94%) rename java/src/org/broadinstitute/sting/{playground/gatk => oneoffprojects}/walkers/BaseTransitionTableCalculatorJavaWalker.java (99%) rename java/src/org/broadinstitute/sting/{playground/gatk => oneoffprojects}/walkers/HapmapPoolAllelicInfoWalker.java (99%) rename java/src/org/broadinstitute/sting/{playground/gatk => oneoffprojects}/walkers/IOCrusherWalker.java (97%) rename java/src/org/broadinstitute/sting/{playground/gatk => oneoffprojects}/walkers/MismatchCounterWalker.java (100%) rename java/src/org/broadinstitute/sting/{playground/gatk => oneoffprojects}/walkers/MismatchHistoWalker.java (100%) delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ListSampleIds.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/ListSampleIds.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/ListSampleIds.java rename to archive/java/src/org/broadinstitute/sting/multisamplecaller/ListSampleIds.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintHapmapGenotypes.java b/archive/java/src/org/broadinstitute/sting/multisamplecaller/PrintHapmapGenotypes.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintHapmapGenotypes.java rename to archive/java/src/org/broadinstitute/sting/multisamplecaller/PrintHapmapGenotypes.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/FlagStatReadWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java similarity index 94% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/FlagStatReadWalker.java rename to java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java index f487f8ab8..46787e020 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/FlagStatReadWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java @@ -1,14 +1,10 @@ -package org.broadinstitute.sting.playground.gatk.walkers; +package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; import java.text.DecimalFormat; import java.text.NumberFormat; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; - /* * Copyright (c) 2009 The Broad Institute @@ -38,13 +34,13 @@ import org.broadinstitute.sting.gatk.walkers.DataSource; /** * @author aaron *

- * Class FlagStatReadWalker + * Class FlagStatWalker *

* This walker mirrors that stats that are generated by the flagstat * command of samtools. */ @Requires({DataSource.READS}) -public class FlagStatReadWalker extends ReadWalker { +public class FlagStatWalker extends ReadWalker { // what comes out of the flagstat class FlagStat { int readCount = 0; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java index ab17f1fb2..7df8fe7af 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers; +package org.broadinstitute.sting.oneoffprojects.walkers; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.genotyper.*; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/HapmapPoolAllelicInfoWalker.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/HapmapPoolAllelicInfoWalker.java index 4b9610ac0..bdb598d30 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HapmapPoolAllelicInfoWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/HapmapPoolAllelicInfoWalker.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers; +package org.broadinstitute.sting.oneoffprojects.walkers; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java index 06e4aa690..ce591919b 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IOCrusherWalker.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers; +package org.broadinstitute.sting.oneoffprojects.walkers; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.cmdLine.Argument; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchCounterWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchCounterWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchCounterWalker.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchCounterWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchHistoWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchHistoWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/MismatchHistoWalker.java rename to java/src/org/broadinstitute/sting/oneoffprojects/walkers/MismatchHistoWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java deleted file mode 100644 index d7c03413d..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java +++ /dev/null @@ -1,945 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.playground.utils.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.*; -import java.util.zip.*; -import java.io.*; - -// Beta iterative multi-sample caller -// j.maguire 6-11-2009 - -public class MultiSampleCaller extends LocusWalker -{ - @Argument(required=false, shortName="fractional_counts", doc="should we use fractional counts?") public boolean FRACTIONAL_COUNTS = false; - @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; - @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT = null; - @Argument(fullName="individual_output", shortName="individual_output", required=false, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT = null; - @Argument(fullName="stats_output", shortName="stats_output", required=false, doc="file to write stats to") public String STATS_OUTPUT = null; - @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; - @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; - @Argument(fullName="weight_samples", shortName="weight_samples", required=false, doc="rw-weight samples during EM?") public boolean WEIGHT_SAMPLES = false; - - @Argument(fullName="theta", shortName="theta", required=false, doc="rate of sequence divergence") public double THETA = 1e-3; - @Argument(fullName="allele_frequency_prior", shortName="allele_frequency_prior", required=false, doc="use prior on allele frequencies? (P(f) = theta/(N*f)") public boolean ALLELE_FREQUENCY_PRIOR = false; - - @Argument(fullName="confusion_matrix_file", shortName="confusion_matrix_file", required=false, doc="file containing confusion matrix for all three technologies") public String CONFUSION_MATRIX_FILE = null; - - // Private state. - protected List sample_names; - protected SAMFileHeader header; - protected PrintStream individual_output_file = null; - protected PrintStream discovery_output_file = null; - protected PrintStream stats_output_file = null; - - private boolean INCLUDE_STATS = false; - private boolean INCLUDE_GENOTYPES = false ; - - class MultiSampleCallResult - { - GenomeLoc location; - char ref; - char alt; - EM_Result em_result; - double lod; - double strand_score; - double pD; - double pNull; - String in_dbsnp; - int n_ref; - int n_het; - int n_hom; - int EM_N; - double alt_freq; - - public MultiSampleCallResult(GenomeLoc location, char ref, char alt, EM_Result em_result, double lod, double strand_score, double pD, double pNull, String in_dbsnp, int n_ref, int n_het, int n_hom, int EM_N, double alt_freq) - { - this.location = location; - this.ref = ref; - this.alt = alt; - this.em_result = em_result; - this.lod = lod; - this.strand_score = strand_score; - this.pD = pD; - this.pNull = pNull; - this.in_dbsnp = in_dbsnp; - this.n_ref = n_ref; - this.n_het = n_het; - this.n_hom = n_hom; - this.EM_N = EM_N; - this.alt_freq = alt_freq; - } - - public MultiSampleCallResult() { } // this is just so I can do new MultiSampleCallResult().header(). "inner classes cannot have static declarations" :( - - public String header() - { - return new String("loc ref alt lod strand_score pD pNull in_dbsnp pA pC pG pT EM_alt_freq EM_N n_ref n_het n_hom"); - } - - public String toString() - { - String s = ""; - s = s + String.format("%s %c %c %f %f %f %f %s ", location, ref, alt, lod, strand_score, pD, pNull, in_dbsnp); - for (int i = 0; i < 4; i++) { s = s + String.format("%f ", em_result.allele_likelihoods[i]); } - s = s + String.format("%f %d %d %d %d", alt_freq, em_result.EM_N, n_ref, n_het, n_hom); - return s; - } - } - - public static class DepthStats - { - public static String Header() - { - return "loc ref depth A C G T a c g t mq_min mq_mean mq_median mq_max mq_sd"; - } - - public static String Row(char ref, AlignmentContext context) - { - String ans = ""; - List reads = context.getReads(); - List offsets = context.getOffsets(); - //Pileup pileup = new ReadBackedPileupOld(ref, context); - - ans += String.format("%s ", context.getLocation()); - ans += String.format("%c ", ref); - ans += String.format("%d ", reads.size()); - ans += String.format("%d ", countBase(context, 'A', "+")); - ans += String.format("%d ", countBase(context, 'C', "+")); - ans += String.format("%d ", countBase(context, 'G', "+")); - ans += String.format("%d ", countBase(context, 'T', "+")); - ans += String.format("%d ", countBase(context, 'A', "-")); - ans += String.format("%d ", countBase(context, 'C', "-")); - ans += String.format("%d ", countBase(context, 'G', "-")); - ans += String.format("%d ", countBase(context, 'T', "-")); - - ans += String.format("%s ", Stats(BasicPileup.mappingQualPileup(reads))); - - return ans; - } - - static int countBase(AlignmentContext context, char base, String strand) - { - int count = 0; - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i = 0; i < reads.size(); i++) - { - if (reads.get(i).getReadString().charAt(offsets.get(i)) == base) - { - if (strand.equals("+") && (reads.get(i).getReadNegativeStrandFlag()==false)) { count += 1; } - else if (strand.equals("-") && (reads.get(i).getReadNegativeStrandFlag()==true)) { count += 1; } - else if (! (strand.equals("+") || strand.equals("-"))) { count += 1; } - } - } - return count; - } - - public static String Stats(ArrayList X) - { - Collections.sort(X); - - long count = 0; - long sum = 0; - long min = X.get(0); - long max = X.get(0); - long median = X.get(0); - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - if (x < min) { min = x; } - if (x > max) { max = x; } - sum += x; - count += 1; - if (i == X.size()/2) { median = x; } - } - - double mean = sum/count; - for (int i = 0; i < X.size(); i++) - { - int x = X.get(i); - sum += (x-mean)*(x-mean); - count += 1; - } - double sd = Math.sqrt(sum/count); - - return String.format("%d %f %d %d %f", min, mean, median, max, sd); - } - } - - private void maybeInitializeDisoveryOutput() { - if ( discovery_output_file == null ) { - try { - //final String filename = String.format("%s_%s_%d.calls", DISCOVERY_OUTPUT, loc.getContig(), loc.getStart()); - final String filename = DISCOVERY_OUTPUT; - //System.out.printf("opening %s%n", filename); - discovery_output_file = new PrintStream(filename); - discovery_output_file.println(new MultiSampleCallResult().header()); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - } - - ///////// - // Walker Interface Functions - public void initialize() - { - System.out.printf("\n\n\n\n"); - (new ClassicGenotypeLikelihoods()).TEST(); - - try - { - maybeInitializeDisoveryOutput(); - - INCLUDE_GENOTYPES = INDIVIDUAL_OUTPUT != null; - if ( INCLUDE_GENOTYPES ) { - individual_output_file = new PrintStream(new GZIPOutputStream(new FileOutputStream(INDIVIDUAL_OUTPUT))); - individual_output_file.println("loc ref sample_name genotype lodVsNextBest lodVsRef in_dbsnp AA AC AG AT CC CG CT GG GT TT"); - } - - INCLUDE_STATS = STATS_OUTPUT != null; - if ( INCLUDE_STATS ) { - stats_output_file = new PrintStream(STATS_OUTPUT); - stats_output_file.println(DepthStats.Header()); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - GenomeAnalysisEngine toolkit = this.getToolkit(); - this.header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - sample_names = new ArrayList(); - - HashSet unique_sample_names = new HashSet(); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - String platform = (String)(read_groups.get(i).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - System.out.printf("SAMPLE: %s %s\n", sample_name, platform); - - if (unique_sample_names.contains(sample_name)) { continue; } - unique_sample_names.add(sample_name); - sample_names.add(sample_name); - - System.out.printf("UNIQUE_SAMPLE: %s %s\n", sample_name, platform); - } - - // Load the confusion matrix if it exists - if (CONFUSION_MATRIX_FILE != null) - { - this.confusion_matrix = new ConfusionMatrix(CONFUSION_MATRIX_FILE); - } - - } - - - public MultiSampleCallResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - context = filter_each_read(context); - - if (ref.getBase() == 'N') { return null; } - if (context.getReads().size() <= 0) { return null; } - if (context.getReads().size() >= 10000) { return null; } // to deal with big piles -- totally arbitrary threshold - - this.ref = ref.getBase(); - MultiSampleCallResult result = this.MultiSampleCall(tracker, ref.getBase(), context, sample_names); - if ( INCLUDE_STATS ) stats_output_file.println(DepthStats.Row(ref.getBase(), context)); - return result; - } - - public void onTraversalDone(String sum) - { - discovery_output_file.flush(); - discovery_output_file.close(); - - if ( INCLUDE_STATS ) { - stats_output_file.flush(); - stats_output_file.close(); - } - - out.println("MultiSampleCaller done."); - return; - } - - public String reduceInit() - { - return null; - } - - public String reduce(MultiSampleCallResult record, String sum) - { - if (record != null && record.n_ref != record.EM_N) - { - discovery_output_file.printf(record.toString() + "\n"); - } - return null; - } - - // END Walker Interface Functions - ///////// - - - ///////// - // Calling Functions - - char ref; - protected ConfusionMatrix confusion_matrix; - - ClassicGenotypeLikelihoods Genotype(AlignmentContext context, double[] allele_likelihoods, double indel_alt_freq) - { - //ReadBackedPileup pileup = new ReadBackedPileup(ref, context); - //String bases = pileup.getBases(); - - List reads = context.getReads(); - List offsets = context.getOffsets(); - ref = Character.toUpperCase(ref); - - if (reads.size() == 0) { - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - return G; - } - - - // Handle single-base polymorphisms. - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - for ( int i = 0; i < reads.size(); i++ ) - { - //System.out.printf("DBG: %s\n", context.getLocation()); - - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - if (CONFUSION_MATRIX_FILE == null) - { - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); - } - else - { - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String platform = (String)(header.getReadGroup(RG).getAttribute(SAMReadGroupRecord.PLATFORM_TAG)); - - G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset], confusion_matrix, platform); - } - } - G.ApplyPrior(ref, allele_likelihoods); - - // Handle indels - if (CALL_INDELS) - { - String[] indels = BasicPileup.indelPileup(reads, offsets); - IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq); - if (indel_call.getType() != null) - { - G.addIndelLikelihood(indel_call); - } - else - { - G.addIndelLikelihood(null); - } - } - - /* - // Handle 2nd-best base calls. - if (fourBaseMode && pileup.getBases().length() < 750) - { - G.applySecondBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup()); - } - */ - - return G; - } - - double[] CountFreqs_gold(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] allele_likelihoods = new double[4]; - for (int x = 0; x < genotype_likelihoods.length; x++) - { - ClassicGenotypeLikelihoods G = genotype_likelihoods[x]; - - if (G.coverage == 0) { continue; } - - double Z = 0; - for(int k = 0; k < 10; k++) { Z += Math.pow(10,G.likelihoods[k]); } - Z = Math.log10(Z); - - double[] personal_allele_likelihoods = new double[4]; - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - double likelihood = Math.pow(10,G.likelihoods[k]-Z); - personal_allele_likelihoods[i] += likelihood; - personal_allele_likelihoods[j] += likelihood; - k++; - } - } - double sum = 0; - for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; } - for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; } - for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; } - } - - double sum = 0; - for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; } - for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; } - - return allele_likelihoods; - } - - // This version is a little faster. - double[] CountFreqs(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] allele_likelihoods = new double[4]; - for (int x = 0; x < genotype_likelihoods.length; x++) - { - ClassicGenotypeLikelihoods G = genotype_likelihoods[x]; - - if (G.coverage == 0) { continue; } - - double Z = 0; - double[] personal_allele_likelihoods = new double[4]; - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - double likelihood = Math.pow(10,G.likelihoods[k]); - Z += likelihood; - personal_allele_likelihoods[i] += likelihood; - personal_allele_likelihoods[j] += likelihood; - k++; - } - } - double sum = 0; - for (int y = 0; y < 4; y++) { sum += personal_allele_likelihoods[y]; } - for (int y = 0; y < 4; y++) { personal_allele_likelihoods[y] /= sum; } - for (int y = 0; y < 4; y++) { allele_likelihoods[y] += personal_allele_likelihoods[y]; } - } - - double sum = 0; - for (int i = 0; i < 4; i++) { sum += allele_likelihoods[i]; } - for (int i = 0; i < 4; i++) { allele_likelihoods[i] /= sum; } - - return allele_likelihoods; - } - - - double CountIndelFreq(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - HashMap indel_allele_likelihoods = new HashMap(); - - double pRef = 0; - double pAlt = 0; - - for (int j = 0; j < sample_names.size(); j++) - { - double personal_pRef = 0; - double personal_pAlt = 0; - - IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood(); - personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet()); - personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet()); - - personal_pRef = personal_pRef / (personal_pAlt + personal_pRef); - personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef); - - pRef += personal_pRef; - pAlt += personal_pAlt; - } - - pRef = pRef / (pRef + pAlt); - pAlt = pAlt / (pRef + pAlt); - - return pAlt; - } - - // Potential precision error here. - double Compute_pD(ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] sample_weights) - { - double pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD += Math.log10(sample_weights[i] * sum); - } - return pD; - } - - double Compute_pNull(AlignmentContext[] contexts, double[] sample_weights) - { - double[] allele_likelihoods = new double[4]; - for (int i = 0; i < 4; i++) { allele_likelihoods[i] = 1e-6/3.0; } - allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6); - } - return Compute_pD(G, sample_weights); - } - - double[] Compute_SampleWeights(ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double[] pD = new double[sample_names.size()]; - double total_pD = 0; - for (int i = 0; i < sample_names.size(); i++) - { - double sum = 0; - for (int j = 0; j < 10; j++) - { - sum += Math.pow(10, genotype_likelihoods[i].likelihoods[j]); - } - pD[i] = sum; - total_pD += pD[i]; - } - - for (int i = 0; i < sample_names.size(); i++) - { - pD[i] /= total_pD; - } - - return pD; - } - - // Some globals to cache results. - EM_Result em_result; - double pD; - double pNull; - double lod; - double LOD(AlignmentContext[] contexts) - { - em_result = EM(contexts); - ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; - pD = Compute_pD(G, em_result.sample_weights); - pNull = Compute_pNull(contexts, em_result.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - double p0 = Math.log10(1 - pVar); - double pF; - - double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); - - if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } - else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } - - //System.out.printf("DBG %s %c %f %f %f %f (%.20f) %f ", contexts[0].getLocation(), ref, pD, pF, pNull, p0, Compute_alt_freq(ref, em_result.allele_likelihoods), 2.0 * em_result.EM_N); - //for (int i = 0; i < 4; i++) { System.out.printf("%f ", em_result.allele_likelihoods[i]); } - //System.out.printf("\n"); - - pD = pD + pF; - pNull = pNull + p0; - } - - lod = pD - pNull; - return lod; - } - - class EM_Result - { - String[] sample_names; - ClassicGenotypeLikelihoods[] genotype_likelihoods; - double[] allele_likelihoods; - double[] sample_weights; - int EM_N; - - public EM_Result(List sample_names, ClassicGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods, double[] sample_weights) - { - this.sample_names = new String[1]; - this.sample_names = sample_names.toArray(this.sample_names); - this.genotype_likelihoods = genotype_likelihoods; - this.allele_likelihoods = allele_likelihoods; - this.sample_weights = sample_weights; - - EM_N = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage > 0) { EM_N += 1; } - } - - } - - } - - EM_Result EM(AlignmentContext[] contexts) - { - double[] allele_likelihoods = new double[4]; - - // These initial conditions should roughly replicate classic SSG. (at least on hets) - for (int i = 0; i < 4; i++) - { - if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999) - else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999) - } - double indel_alt_freq = 1e-4; - - double[] sample_weights = new double[sample_names.size()]; - for (int i = 0; i < sample_weights.length; i++) - { - //sample_weights[i] = 1.0/(double)i; - sample_weights[i] = 1.0; - } - - ClassicGenotypeLikelihoods[] G = new ClassicGenotypeLikelihoods[sample_names.size()]; - ClassicGenotypeLikelihoods[] Weighted_G = new ClassicGenotypeLikelihoods[sample_names.size()]; - for (int i = 0; i < MAX_ITERATIONS; i++) - { - for (int j = 0; j < sample_names.size(); j++) - { - G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq); - if (WEIGHT_SAMPLES) { G[j].ApplyWeight(sample_weights[j]); } - } - - allele_likelihoods = CountFreqs(G); - - if (CALL_INDELS) - { - indel_alt_freq = CountIndelFreq(G); - } - - if (WEIGHT_SAMPLES) - { - sample_weights = Compute_SampleWeights(G); - } - } - - return new EM_Result(sample_names, G, allele_likelihoods, sample_weights); - } - - // Hacky global variables for debugging. - double StrandScore(AlignmentContext context) - { - //AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - - AlignmentContext fw = filterAlignmentContext(context, "+"); - AlignmentContext bw = filterAlignmentContext(context, "-"); - AlignmentContext[] contexts_fw = filterAlignmentContext(fw, sample_names, 0); - AlignmentContext[] contexts_bw = filterAlignmentContext(bw, sample_names, 0); - - EM_Result em_fw = EM(contexts_fw); - EM_Result em_bw = EM(contexts_bw); - - double pNull_fw = Compute_pNull(contexts_fw, em_fw.sample_weights); - double pNull_bw = Compute_pNull(contexts_bw, em_bw.sample_weights); - - double pD_fw = Compute_pD(em_fw.genotype_likelihoods, em_fw.sample_weights); - double pD_bw = Compute_pD(em_bw.genotype_likelihoods, em_bw.sample_weights); - - if (ALLELE_FREQUENCY_PRIOR) - { - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - pD_fw = pD_fw + Math.log10(THETA/(2.0*em_fw.EM_N * Compute_alt_freq(ref, em_fw.allele_likelihoods))); - pNull_fw = pNull_fw + Math.log10(1 - pVar); - - pD_bw = pD_bw + Math.log10(THETA/(2.0*em_bw.EM_N * Compute_alt_freq(ref, em_bw.allele_likelihoods))); - pNull_bw = pNull_bw + Math.log10(1 - pVar); - } - - double EM_alt_freq_fw = Compute_alt_freq(ref, em_fw.allele_likelihoods); - double EM_alt_freq_bw = Compute_alt_freq(ref, em_bw.allele_likelihoods); - - //double pNull = Compute_pNull(contexts); - //double lod = LOD(contexts); - - double lod_fw = (pD_fw + pNull_bw) - pNull; - double lod_bw = (pD_bw + pNull_fw) - pNull; - double strand_score = Math.max(lod_fw - lod, lod_bw - lod); - return strand_score; - } - - ClassicGenotypeLikelihoods HardyWeinberg(double[] allele_likelihoods) - { - ClassicGenotypeLikelihoods G = new ClassicGenotypeLikelihoods(); - int k = 0; - for (int i = 0; i < 4; i++) - { - for (int j = i; j < 4; j++) - { - G.likelihoods[k] = allele_likelihoods[i] * allele_likelihoods[j]; - k++; - } - } - return G; - } - - char PickAlt(char ref, double[] allele_likelihoods) - { - Integer[] perm = Utils.SortPermutation(allele_likelihoods); - if (perm[3] != BaseUtils.simpleBaseToBaseIndex(ref)) { return BaseUtils.baseIndexToSimpleBase(perm[3]); } - else { return BaseUtils.baseIndexToSimpleBase(perm[2]); } - } - - double Compute_discovery_lod(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - double pBest = 0; - double pRef = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - pBest += genotype_likelihoods[i].BestPosterior(); - pRef += genotype_likelihoods[i].RefPosterior(ref); - } - return pBest - pRef; - } - - // this one is a bit of a lazy hack. - double Compute_alt_freq(char ref, double[] allele_likelihoods) - { - return allele_likelihoods[BaseUtils.simpleBaseToBaseIndex(PickAlt(ref, allele_likelihoods))]; - } - - int Compute_n_ref(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_het(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) == ref) && (g.charAt(1) != ref)) { n += 1; } - if ((g.charAt(0) != ref) && (g.charAt(1) == ref)) { n += 1; } - } - return n; - } - - int Compute_n_hom(char ref, ClassicGenotypeLikelihoods[] genotype_likelihoods) - { - int n = 0; - for (int i = 0; i < genotype_likelihoods.length; i++) - { - if (genotype_likelihoods[i].coverage == 0) { continue; } - String g = genotype_likelihoods[i].BestGenotype(); - if ((g.charAt(0) != ref) && (g.charAt(1) != ref)) { n += 1; } - } - return n; - } - - // This should actually return a GLF Record - MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List sample_names) - { - String in_dbsnp; - if (tracker.lookup("DBSNP", null) != null) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; } - - AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - double lod = LOD(contexts); - double strand_score = StrandScore(context); - //EM_Result em_result = EM(contexts); - ClassicGenotypeLikelihoods population_genotype_likelihoods = HardyWeinberg(em_result.allele_likelihoods); - - //double pD = Compute_pD(em_result.genotype_likelihoods); - //double pNull = Compute_pNull(contexts); - - double discovery_lod = Compute_discovery_lod(ref, em_result.genotype_likelihoods); - double alt_freq = Compute_alt_freq(ref, em_result.allele_likelihoods); - - int n_ref = Compute_n_ref(ref, em_result.genotype_likelihoods); - int n_het = Compute_n_het(ref, em_result.genotype_likelihoods); - int n_hom = Compute_n_hom(ref, em_result.genotype_likelihoods); - - char alt = 'N'; - //if (lod > 0.0) { alt = PickAlt(ref, em_result.allele_likelihoods); } - if ((n_het > 0) || (n_hom > 0)) { alt = PickAlt(ref, em_result.allele_likelihoods); } - - for (int i = 0; i < em_result.genotype_likelihoods.length; i++) - { - if ( INCLUDE_GENOTYPES ) { - individual_output_file.printf("%s %c %s ", context.getLocation(), ref, sample_names.get(i)); - individual_output_file.printf("%s %f %f %s ", em_result.genotype_likelihoods[i].BestGenotype(), - em_result.genotype_likelihoods[i].LodVsNextBest(), - em_result.genotype_likelihoods[i].LodVsRef(ref), - in_dbsnp); - } - - //individual_output.printf("%s ", new ReadBackedPileup(ref, contexts[i]).getBaseCountsString()); - assert(em_result.genotype_likelihoods[i] != null); - em_result.genotype_likelihoods[i].sort(); - assert(em_result.genotype_likelihoods[i].sorted_likelihoods != null); - - if ( INCLUDE_GENOTYPES ) { - for (int j = 0; j < em_result.genotype_likelihoods[i].sorted_likelihoods.length; j++) - { - individual_output_file.printf("%f ", em_result.genotype_likelihoods[i].likelihoods[j]); - } - individual_output_file.printf("\n"); - } - } - - return new MultiSampleCallResult(context.getLocation(), ref, alt, em_result, lod, strand_score, pD, pNull, in_dbsnp, n_ref, n_het, n_hom, em_result.EM_N, alt_freq); - } - - // END Calling Functions - ///////// - - ///////// - // Utility Functions - - /// Filter a locus context by forward and backward - private AlignmentContext filterAlignmentContext(AlignmentContext context, String strand) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - - // Filter for strandedness - if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; } - if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; } - reads.add(read); - offsets.add(offset); - } - return new AlignmentContext(context.getLocation(), reads, offsets); - } - - // Filter a locus context by sample ID - protected AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample) - { - HashMap index = new HashMap(); - for (int i = 0; i < sample_names.size(); i++) - { - index.put(sample_names.get(i), i); - } - - AlignmentContext[] contexts = new AlignmentContext[sample_names.size()]; - ArrayList[] reads = new ArrayList[sample_names.size()]; - ArrayList[] offsets = new ArrayList[sample_names.size()]; - - for (int i = 0; i < sample_names.size(); i++) - { - reads[i] = new ArrayList(); - offsets[i] = new ArrayList(); - } - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(header != null); - assert(header.getReadGroup(RG) != null); - - String sample = header.getReadGroup(RG).getSample(); - if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - reads[index.get(sample)].add(read); - offsets[index.get(sample)].add(offset); - } - - if (downsample != 0) - { - for (int j = 0; j < reads.length; j++) - { - List perm = new ArrayList(); - for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } - perm = Utils.RandomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads[j].get(perm.get(i))); - downsampled_offsets.add(offsets[j].get(perm.get(i))); - } - - reads[j] = downsampled_reads; - offsets[j] = downsampled_offsets; - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - else - { - for (int j = 0; j < reads.length; j++) - { - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - - return contexts; - } - - private AlignmentContext filter_each_read(AlignmentContext L) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < L.getReads().size(); i++) - { - SAMRecord read = L.getReads().get(i); - Integer offset = L.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - - assert(this.header != null); - //assert(this.header.getReadGroup(RG) != null); - if (this.header.getReadGroup(RG) == null) { continue; } - - // skip bogus data - if (read.getMappingQuality() == 0) { continue; } - - String sample = this.header.getReadGroup(RG).getSample(); - //if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - - reads.add(read); - offsets.add(offset); - } - - AlignmentContext ans = new AlignmentContext(L.getLocation(), reads, offsets); - - return ans; - } - - // END Utility functions - ///////// - - - - -}