From 3485397483e0904d5d19fbf519813cb229283a07 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 3 Aug 2009 20:55:31 +0000 Subject: [PATCH] Reorganization of the genotyping system git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1370 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/GenotypeLikelihoods.java | 21 +- .../walkers/genotyper/SSGGenotypeCall.java | 8 +- .../genotyper/SingleSampleGenotyper.java | 14 +- .../gatk/walkers/AlleleFrequencyWalker.java | 915 ------------------ .../gatk/walkers/CoverageBySample.java | 3 - .../gatk/walkers/CoverageEvalWalker.java | 8 +- .../gatk/walkers/CoverageHistogram.java | 9 - .../FindNonrandomSecondBestBasePiles.java | 187 ---- .../gatk/walkers/HLAcaller/CallHLAWalker.java | 7 +- .../gatk/walkers/MultiSampleCaller.java | 17 +- .../MultiSampleCallerAccuracyTest.java | 9 - .../gatk/walkers/PoolCallingExperiment.java | 301 ------ .../gatk/walkers/PrintCoverageWalker.java | 16 - .../variants/ClusteredSNPFilterWalker.java | 6 +- .../variants/VariantFiltrationWalker.java | 6 +- .../utils/AlleleFrequencyEstimate.java | 210 ---- .../sting/playground/utils/AlleleMetrics.java | 198 ---- .../sting/utils/genotype/BasicGenotype.java | 2 +- .../sting/utils/genotype/Genotype.java | 2 +- .../sting/utils/genotype/GenotypeCall.java | 2 +- .../sting/utils/genotype/GenotypeOutput.java | 2 - .../sting/utils/genotype/Variant.java | 4 +- .../utils/genotype/geli/GeliAdapter.java | 2 +- .../utils/genotype/geli/GeliTextWriter.java | 6 +- .../sting/utils/genotype/glf/GLFWriter.java | 2 +- 25 files changed, 41 insertions(+), 1916 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/FindNonrandomSecondBestBasePiles.java delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCallingExperiment.java delete mode 100755 java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java delete mode 100755 java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 30ee11983..7a04ed822 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -1,15 +1,11 @@ -package org.broadinstitute.sting.playground.utils; +package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.GenotypeGenerator; -import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; import static java.lang.Math.log10; @@ -18,7 +14,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; -public class GenotypeLikelihoods implements GenotypeGenerator { +public class GenotypeLikelihoods { // precalculate these for performance (pow/log10 is expensive!) /** @@ -336,16 +332,6 @@ public class GenotypeLikelihoods implements GenotypeGenerator { return this.ref_likelihood; } - private IndelLikelihood indel_likelihood; - - public void addIndelLikelihood(IndelLikelihood indel_likelihood) { - this.indel_likelihood = indel_likelihood; - } - - public IndelLikelihood getIndelLikelihood() { - return this.indel_likelihood; - } - /** * given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs * @@ -355,8 +341,7 @@ public class GenotypeLikelihoods implements GenotypeGenerator { * * @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values */ - @Override - public GenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) { + public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) { filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag // for calculating the rms of the mapping qualities diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java index b5c834980..068e9e06a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java @@ -1,17 +1,13 @@ -package org.broadinstitute.sting.utils.genotype.calls; +package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotype.BasicGenotype; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.GenotypeOutput; -import org.broadinstitute.sting.utils.genotype.LexigraphicalComparator; import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore; -import org.broadinstitute.sting.utils.genotype.variant.Variant; +import org.broadinstitute.sting.utils.genotype.*; import java.util.*; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index 63438d1ed..88b817d16 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -1,22 +1,16 @@ -package org.broadinstitute.sting.playground.gatk.walkers; +package org.broadinstitute.sting.gatk.walkers.genotyper; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.playground.utils.AlleleMetrics; -import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; import java.io.File; @@ -108,13 +102,11 @@ public class SingleSampleGenotyper extends LocusWalker// implements AllelicVariant -{ - @Argument(doc="Number of chromosomes in data") public int N; - @Argument(required=false,doc="downsample") public int DOWNSAMPLE = 0; - @Argument(doc="File to output GFF formatted allele frequency calls") public String GFF_OUTPUT_FILE; - @Argument(shortName="met", doc="Turns on logging of metrics on the fly with AlleleFrequency calculation") public boolean LOG_METRICS; - @Argument(required=false, doc="Name of file where metrics will output") public String METRICS_OUTPUT_FILE = "metrics.out"; - @Argument(required=false, doc="Ignores 4-base probabilities and only uses the quality of the best/called base") public boolean FORCE_1BASE_PROBS; - - protected static Logger logger = Logger.getLogger(AlleleFrequencyWalker.class); - - Random random; - PrintStream output; - - private boolean initalized = false; - static private double quality_precision = 1e-4; - public void initalize() - { - if (initalized) { return; } - - try - { - this.random = new java.util.Random(0); - if ( GFF_OUTPUT_FILE.equals("-") ) - this.output = out; - else - this.output = new PrintStream(GFF_OUTPUT_FILE); - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - initalized = true; - } - - public boolean requiresReads() { return true; } - - public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) - { - // init in the map because GATK doesn't appear to be initing me today. no problemo. - this.initalize(); - - // Convert context data into bases and 4-base quals - Pileup pileup = new ReadBackedPileup(ref, context); - String bases = pileup.getBases(); - double quals[][] = getQuals(context); - //String[] indels = getIndels(context); - - logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE)); - - if ((DOWNSAMPLE != 0) && (DOWNSAMPLE < bases.length())) - { - String downsampled_bases = ""; - double downsampled_quals[][] = new double[DOWNSAMPLE][4]; - - int picked_bases[] = new int[bases.length()]; - for (int i = 0; i < picked_bases.length; i++) { picked_bases[i] = 0; } - while (downsampled_bases.length() < DOWNSAMPLE) - { - int choice; - - /* - System.out.printf("DBG %b %b %b\n", - random == null, - bases == null, - picked_bases == null); - */ - - for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length())); - picked_bases[choice] = 1; - downsampled_bases += bases.charAt(choice); - downsampled_quals[downsampled_bases.length()-1] = quals[choice]; - } - - //System.out.printf("From: %s\n", bases); - //System.out.printf("To : %s\n", downsampled_bases); - - bases = downsampled_bases; - quals = downsampled_quals; - } - - // Count bases - int[] base_counts = new int[4]; - for (byte b : bases.getBytes()) - base_counts[nuc2num[b]]++; - - // Find alternate allele - 2nd most frequent non-ref allele - // (maybe we should check for ties and eval both or check most common including quality scores) - int altnum = -1; // start with first base numerical identity (0,1,2,3) - double altcount = -1; // start with first base count - int refnum = nuc2num[ref]; - - for (int b=0; b<4; b++) { - if ((b != refnum) && (base_counts[b] > altcount)) { - altnum = b; altcount = base_counts[b]; - } - } - assert(altnum != -1); - - AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation(), N, bases.getBytes(), quals, refnum, altnum, bases.length()); - - alleleFreq.notes = String.format("A:%d C:%d G:%d T:%d", - base_counts[nuc2num['A']], - base_counts[nuc2num['C']], - base_counts[nuc2num['G']], - base_counts[nuc2num['T']]); - - // Print dbSNP data if its there - if (true) { - for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { - if ( datum != null && datum instanceof rodDbSNP) { - rodDbSNP dbsnp = (rodDbSNP)datum; - //System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD())); - alleleFreq.notes += String.format(" ROD: %s ",dbsnp.toString()); - } - } - } - - logger.debug(String.format(" => result is %s", alleleFreq)); - - //if (LOG_METRICS) metrics.nextPosition(alleleFreq, tracker); - - return alleleFreq; - } - - /* - static public String[] getIndels(LocusContext context) - { - List reads = context.getReads(); - List offsets = context.getOffsets(); - String[] indels = new String[reads.size()]; - - for (int i = 0; i < reads.size(); i++) - { - SAMRecord read = reads.get(i); - Cigar cigar = read.getCigar(); - - int k = 0; - for (int j = 0; j < cigar.numCigarElements(); j++) - { - CigarOperator operator = cigar.getCigarElement(j).getOperator(); - int length = cigar.getCigarElement(j).getLength(); - if (operator == CigarOperator.M) - { - k += length; - } - else if ((k != offset) && (operator == CigarOperator.I)) - { - k += length; - } - else if ((k == offset) && (operator == CigarOperator.I)) - { - // this insertion is associated with this offset. - - break; - } - else if ((k == offset) && (operator == CigarOperator.D)) - { - - break; - } - else if ((k == offset) && - ((operator == CigarOperator.I) || (operator == CigarOperator.D))) - { - // no indel here. - indels[i] = ""; - break; - } - } - } - - return indels; - } - */ - - public double[][] getQuals (LocusContext context) - { - int numReads = context.getReads().size(); //numReads(); - double[][] quals = new double[numReads][4]; - - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i =0; i < numReads; i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - // First, set the called base to the correct quality - int callednum = nuc2num[read.getReadBases()[offset]]; // number of the called base (0,1,2,3) - - quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); - - // For other 3 bases, check if 2-base probs exist - assert (reads.size() > 0); - Object SQ_field = reads.get(i).getAttribute("SQ"); - if (SQ_field == null || FORCE_1BASE_PROBS) { - // Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual - double nonref_quals = (1.0 - quals[i][callednum]) / 3; - for (int b=0; b<4; b++) - { - if (b != callednum) - { - quals[i][b] = nonref_quals; - if (quals[i][b] <= quality_precision) { quals[i][b] = quality_precision; } // avoid zero probs. - } - } - }else{ - assert (SQ_field instanceof byte[]); - byte[] hex_quals = (byte[]) SQ_field; - //System.out.printf("SQ field (hex): %s\n", bytesToHexString(hex_quals)); - //System.out.printf("SAM record: %s\n", read.format()); - - byte hex_qual = hex_quals[offset]; - int called2num = QualityUtils.compressedQualityToBaseIndex(hex_qual); - - /* - int crossTalkPartnerIndex = BaseUtils.crossTalkPartnerIndex(callednum); - - if (called2num == crossTalkPartnerIndex) { - double nonref_quals = (1.0 - quals[i][callednum]) / 3.0; - for (int b=0; b<4; b++) - if (b != callednum) - quals[i][b] = nonref_quals; - } else { - */ - double qual2 = QualityUtils.compressedQualityToProb(hex_qual); - //System.out.printf("2ND %x %d %f\n", hex_qual, called2num, qual2); - quals[i][called2num] = qual2; - if (quals[i][called2num] <= quality_precision) { quals[i][called2num] = quality_precision; } // avoid zero probs. - - double nonref_quals = (1.0 - quals[i][callednum] - quals[i][called2num]) / 2.0; - for (int b=0; b<4; b++) - if (b != callednum && b != called2num) - quals[i][b] = nonref_quals; - //} - } - } - - //print_base_qual_matrix(quals); - return quals; - } - - // Code pulled from SAMUtils for debugging - static String bytesToHexString(final byte[] data) { - final char[] chars = new char[2 * data.length]; - for (int i = 0; i < data.length; i++) { - final byte b = data[i]; - chars[2*i] = toHexDigit((b >> 4) & 0xF); - chars[2*i+1] = toHexDigit(b & 0xF); - } - return new String(chars); - } - - private static char toHexDigit(final int value) { - return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10)); - } - - public AlleleFrequencyEstimate AlleleFrequencyEstimator(GenomeLoc location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth) - { - - // q = hypothetical %nonref - // qstar = true %nonref - // G = N, qstar, alt - // N = number of chromosomes - // alt = alternate hypothesis base (most frequent nonref base) - // ref = reference base - - double qstar; - int qstar_N; - - double qstep = 0.001; - double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating - - double posterior_null_hyp; - - // Initialize empyt MixtureLikelihood holders for each possible qstar (N+1) - MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1]; - double posteriors[] = new double[N+1]; - - { - double q = ML_q_byEM(bases, quals, refnum, altnum); - double pDq = P_D_q(bases, quals, q, refnum, altnum); - long q_R = Math.round(q*bases.length); - - posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0, 0) + P_G(N, 0); - bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0); - posteriors[0] = posterior_null_hyp; - - //System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, 0.0, 0.0, P_D_q(bases, quals, 0.0, refnum, altnum), P_q_G(bases, N, 0.0, 0, 0), P_G(N, 0), prior_alt_frequency, posterior_null_hyp, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases)); - - assert(! Double.isNaN(P_D_q(bases, quals, 0.0, refnum, altnum))); - assert(! Double.isNaN(P_q_G(bases, N, 0.0, 0, 0))); - assert(! Double.isNaN(P_G(N, 0))); - - assert(! Double.isNaN(posteriors[0])); - assert(! Double.isInfinite(posteriors[0])); - - // qstar - true allele balances - //for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) - for (qstar_N = 1; qstar_N <= N; qstar_N += 1) - { - qstar = (double)qstar_N / (double)N; - - double pqG = P_q_G(bases, N, q, qstar, q_R); - double pG = P_G(N, qstar_N); - double posterior = pDq + pqG + pG; - - bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q); - posteriors[qstar_N] = posterior; - - //System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", - // location, q, qstar, pDq, pqG, pG, prior_alt_frequency, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases)); - } - } - - // First reverse sort mixtures according to highest posterior probabililty - Arrays.sort(bestMixtures); - - // Calculate Lod of any variant call versus the reference call - // Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis - // reference mixture - qhat = qstar = 0.0 - double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp; - - // Now reverse sort ALL mixtures according to highest posterior probability - Arrays.sort(bestMixtures); - - // Calculate Lod of the mixture versus other possible - // Answers how confident are we in the best mixture versus the nextPosition best mixture - double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior; - - if (lodVarVsRef == 0) { lodVarVsRef = -1.0 * lodBestVsNextBest; } - - AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location, - num2nuc[refnum], - num2nuc[altnum], - N, - bestMixtures[0].qhat, - bestMixtures[0].qstar, - lodVarVsRef, - lodBestVsNextBest, - bestMixtures[0].posterior, - posterior_null_hyp, - depth, - new String(bases), - quals, - posteriors, - "unknown_sample"); - return alleleFreq; - } - - public class MixtureLikelihood implements Comparable { - public double posterior; - public double qstar; - public double qhat; - - MixtureLikelihood() { - this.posterior = Math.log10(0); - this.qstar = Math.log10(0); - this.qhat = Math.log10(0); - } - - MixtureLikelihood(double posterior, double qstar, double qhat) { - this.posterior = posterior; - this.qstar = qstar; - this.qhat = qhat; - } - - public int compareTo (MixtureLikelihood other) { - // default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element - if (posterior < other.posterior) return 1; - else if (posterior > other.posterior) return -1; - else return 0; - } - } - - double ML_q(byte[] bases, double[][]quals, int refnum, int altnum) - { - double ref_count = 0; - double alt_count = 0; - for (int i=0; i= best_likelihood) - { - best_likelihood = likelihood; - best_q = q; - } - } - return best_q; - } - - double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) - { - double p = 0.0; - double epsilon = 1e-4; - - for (int i=0; i n) - return 0; - - if (k > n/2) - k = n-k; // Take advantage of symmetry - - double accum = 1; - for (long i = 1; i <= k; i++) - accum = accum * (n-k+i) / i; - - //return accum + 0.5; // avoid rounding error - return accum; // avoid rounding error - - /* - long m = n - k; - if (k < m) - k = m; - - long t = 1; - for (long i = n, j = 1; i > k; i--, j++) - t = t * i / j; - return t; - */ - } - - public static String repeat(char letter, long count) { - // Repeat a character count times - String result = ""; - for (int i=0; i -5.0) || (current_offset != last_position_considered + 1)) ) - { - // No longer hom-ref, so output a ref line. - double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length; - - output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n", - confident_ref_interval_contig, - confident_ref_interval_start, - last_position_considered, - lod, - (int)(confident_ref_interval_length)); - - inside_confident_ref_interval = false; - } - else if (inside_confident_ref_interval && (alleleFreq.lodVsRef <= -5.0)) - { - // Still hom-ref so increment the counters. - confident_ref_interval_LOD_sum += alleleFreq.lodVsRef; - confident_ref_interval_length += 1; - } - else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef > -5.0)) - { - // do nothing. - } - else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef <= -5.0)) - { - // We moved into a hom-ref region so start a new interval. - confident_ref_interval_contig = alleleFreq.location.getContig(); - confident_ref_interval_start = alleleFreq.location.getStart(); - confident_ref_interval_LOD_sum = alleleFreq.lodVsRef; - confident_ref_interval_length = 1; - inside_confident_ref_interval = true; - } - - last_position_considered = current_offset; - - if (alleleFreq.lodVsRef >= 5) { - this.output.print(alleleFreq.asGFFString()); - - /* - String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N); - System.out.print("DEBUG " + gtype + " "); - if (gtype.contentEquals("het")) { - System.out.println(alleleFreq.ref + "" + alleleFreq.alt); - } else if (gtype.contentEquals("hom")) { - System.out.println(alleleFreq.ref + "" + alleleFreq.ref); - } else { - System.out.println(alleleFreq.alt + "" + alleleFreq.alt); - } - */ - } - - if (LOG_METRICS) metrics.printMetricsAtLocusIntervals(1000); - return ""; - } - - private double prior_alt_frequency = -1.0; - public void setAlleleFrequencyPrior(double frequency) - { - assert(! Double.isNaN(frequency) ); - assert(! Double.isInfinite(frequency)); - if (frequency == 0) { frequency = 1e-5; } // how many epsilons do we need!? This is worrisome. - this.prior_alt_frequency = frequency; - } - - static int nuc2num[]; - static char num2nuc[]; - public AlleleFrequencyWalker() { - nuc2num = new int[128]; - nuc2num['A'] = 0; - nuc2num['C'] = 1; - //nuc2num['T'] = 2; - nuc2num['G'] = 2; - //nuc2num['G'] = 3; - nuc2num['T'] = 3; - nuc2num['a'] = 0; - nuc2num['c'] = 1; - //nuc2num['t'] = 2; - nuc2num['g'] = 2; - //nuc2num['g'] = 3; - nuc2num['t'] = 3; - - num2nuc = new char[4]; - num2nuc[0] = 'A'; - num2nuc[1] = 'C'; - //num2nuc[2] = 'T'; - num2nuc[2] = 'G'; - //num2nuc[3] = 'G'; - num2nuc[3] = 'T'; - - //if (System.getenv("N") != null) { this.N = (new Integer(System.getenv("N"))).intValue(); } - //else { this.N = 2; } - // - //if (System.getenv("DOWNSAMPLE") != null) { this.DOWNSAMPLE = (new Integer(System.getenv("DOWNSAMPLE"))).intValue(); } - //else { this.DOWNSAMPLE = 0; } - } - - public void onTraversalDone(String result) - { - if (inside_confident_ref_interval) - { - // if we have a confident reference interval still hanging open, close it. - - double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length; - - output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n", - confident_ref_interval_contig, - confident_ref_interval_start, - last_position_considered, - lod, - (int)(confident_ref_interval_length)); - - inside_confident_ref_interval = false; - } - - if (LOG_METRICS) metrics.printMetrics(); - - try - { - this.output.flush(); - if ( ! GFF_OUTPUT_FILE.equals("-") ) - this.output.close(); - this.output.close(); - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } - - } - - static void print_base_qual_matrix(double [][]quals) { - // Print quals for debugging - System.out.println("Base quality matrix"); - int numReads = quals.length; - for (int b=0; b<4; b++) { - System.out.print(num2nuc[b]); - for (int i=0; i < numReads; i++){ - System.out.format(" %.4f", quals[i][b]); - } - System.out.println(); - } - } - - public static void main(String[] args) - { - - AlleleFrequencyWalker.binomialProb(5,10,0.5); - AlleleFrequencyWalker.binomialProb(50,100,0.5); - AlleleFrequencyWalker.binomialProb(1500,2965,0.508065); - AlleleFrequencyWalker.binomialProb(0,197,0.5); - AlleleFrequencyWalker.binomialProb(13,197,0.5); - AlleleFrequencyWalker.binomialProb(0, 7, 1e-3); - System.exit(0); - - { - int N = 2; - byte[] het_bases = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - double[][] het_quals = {{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}}; - AlleleFrequencyWalker w = new AlleleFrequencyWalker(); - AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20); - System.out.print(String.format("50%% Het : %s %c %c %f %f %f %d %s\n", - "null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null")); - } - - { - byte[] het_bases = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - - double[][] het_quals = { - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - {0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0}, - }; - - assert(het_bases.length == het_quals.length); - - int N = 10; - AlleleFrequencyWalker w = new AlleleFrequencyWalker(); - w.N = 10; - AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20); - System.out.print(String.format("10%% Het : %s %c %c %f %f %f %d %s\n", - "null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null")); - } - - } - -} - - - - - - - - - - - - - - - - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java index eff224ada..e5dcd3e62 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java @@ -6,9 +6,6 @@ import org.broadinstitute.sting.gatk.*; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; -import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; -import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.BaseUtils; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 4b7162525..7d716e210 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -5,15 +5,13 @@ import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; +import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.ListUtils; +import org.broadinstitute.sting.utils.genotype.GenotypeCall; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.ArrayList; import java.util.List; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java index c375d4112..4db274444 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageHistogram.java @@ -1,20 +1,11 @@ package org.broadinstitute.sting.playground.gatk.walkers; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; -import org.broadinstitute.sting.playground.utils.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.*; import java.util.zip.*; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/FindNonrandomSecondBestBasePiles.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/FindNonrandomSecondBestBasePiles.java deleted file mode 100644 index cd1751e6b..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/FindNonrandomSecondBestBasePiles.java +++ /dev/null @@ -1,187 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Pileup; -import org.broadinstitute.sting.utils.BasicPileup; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import net.sf.samtools.SAMRecord; - -import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; -import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; - -import java.util.List; -import java.util.ArrayList; - -public class FindNonrandomSecondBestBasePiles extends LocusWalker { - @Argument(fullName="verbose",doc="verbose",required=false) - public boolean VERBOSE = false; - - private AlleleFrequencyWalker caller_1b; - private AlleleFrequencyWalker caller_4b; - public void initialize() - { - caller_1b = new AlleleFrequencyWalker(); - caller_1b.N = 2; - caller_1b.DOWNSAMPLE = 0; - caller_1b.GFF_OUTPUT_FILE = "/dev/null"; - caller_1b.FORCE_1BASE_PROBS = true; - caller_1b.initalize(); - caller_1b.reduceInit(); - - caller_4b = new AlleleFrequencyWalker(); - caller_4b.N = 2; - caller_4b.DOWNSAMPLE = 0; - caller_4b.GFF_OUTPUT_FILE = "/dev/null"; - caller_4b.FORCE_1BASE_PROBS = false; - caller_4b.initalize(); - caller_4b.reduceInit(); - } - - // Do we actually want to operate on the context? - public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { - return true; // We are keeping all the reads - } - - public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) - { - ReadBackedPileup pileup = new ReadBackedPileup(ref, context); - - // First, call the site, because we're mostly interested in hets - ref = Character.toUpperCase(ref); - AlleleFrequencyEstimate call_1b = caller_1b.map(tracker, ref, context); - AlleleFrequencyEstimate call_4b = caller_4b.map(tracker, ref, context); - - // Only output data for sites that are confident disagreements. - if (Math.abs(call_1b.lodVsRef) < 5.0) { return 0; } - if (Math.abs(call_4b.lodVsRef) < 5.0) { return 0; } - if (call_1b.genotype().equals(call_4b.genotype())) { return 0; } - - List reads = pileup.getReads(); - List offsets = pileup.getOffsets(); - String best_bases = pileup.getBases(); - String second_bases; - double[] quals = new double[reads.size()]; - double[] second_quals = new double[reads.size()]; - { - char[] second_bases_array = new char[reads.size()]; - for ( int i = 0; i < reads.size(); i++ ) - { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - byte qual = (byte)read.getBaseQualities()[offset]; - quals[i] = 1.0 - Math.pow(10,(double)qual/-10.0); - - second_bases_array[i] = BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(((byte[])read.getAttribute("SQ"))[offset])); - second_quals[i] = QualityUtils.compressedQualityToProb(((byte[])read.getAttribute("SQ"))[offset]); - } - second_bases = new String(second_bases_array); - } - - String rodString = ""; - for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { - if ( datum != null && ! (datum instanceof rodDbSNP)) { - //System.out.printf("rod = %s%n", datum.toSimpleString()); - rodString += datum.toSimpleString(); - //System.out.printf("Rod string %s%n", rodString); - } - } - - rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); - if ( dbsnp != null ) - rodString += dbsnp.toMediumString(); - - if ( rodString != "" ) - rodString = "[ROD: " + rodString + "]"; - - char[] bases = {'A', 'C', 'G', 'T'}; - double[][] counts = new double[4][]; - double[] totals = new double[4]; - double[][] fractional_counts = new double[4][]; - double[] fractional_totals = new double[4]; - for (int i = 0; i < 4; i++) - { - counts[i] = new double[4]; - fractional_counts[i] = new double[4]; - for (int j = 0; j < best_bases.length(); j++) - { - if (best_bases.charAt(j) == bases[i]) - { - counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += 1; - totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += 1; - - fractional_counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += second_quals[j]; - fractional_totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += second_quals[j]; - } - } - } - - out.printf("%s\t%c\t%s\n%s\t%c\t%s\n", - pileup.getLocation(), - ref, - best_bases, - pileup.getLocation(), - ref, - second_bases); - out.printf("%s\t%s\t%f\t%f\n", - pileup.getLocation(), - call_1b.genotype(), - call_1b.lodVsRef, - call_1b.lodVsNextBest); - out.printf("%s\t%s\t%f\t%f\n", - pileup.getLocation(), - call_4b.genotype(), - call_4b.lodVsRef, - call_4b.lodVsNextBest); - for (int i = 0; i < quals.length; i++) - { - out.printf("%s\t%c %c %f\t%f\t%f\t%f\t%f\n", - pileup.getLocation(), - best_bases.charAt(i), - second_bases.charAt(i), - quals[i], - second_quals[i], - -10.0 * Math.log10(1.0 - quals[i]), - -10.0 * Math.log10(1.0 - second_quals[i]), - Math.log10((quals[i])/(second_quals[i]))); - } - out.printf("\n"); - for (int i = 0; i < 4; i++) - { - out.printf("%s\t%c ", pileup.getLocation(), bases[i]); - for (int j = 0; j < 4; j++) - { - out.printf("%.03f ", counts[i][j] / totals[i]); - } - out.printf("\n"); - } - /* - out.printf("\n"); - for (int i = 0; i < 4; i++) - { - out.printf("%s\t%c ", pileup.getLocation(), bases[i]); - for (int j = 0; j < 4; j++) - { - out.printf("%.03f ", fractional_counts[i][j] / fractional_totals[i]); - } - out.printf("\n"); - } - */ - out.printf("\n\n"); - - return 1; - } - - // Given result of map function - public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index 0e02475de..6641caa87 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -10,10 +10,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; + import java.io.*; import java.util.*; /** @@ -103,7 +104,7 @@ public class CallHLAWalker extends LocusWalker>{ out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs); GenotypeLikelihoods G = new GenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY); - SSGGenotypeCall geno = (SSGGenotypeCall)G.callGenotypes(tracker, ref, pileup); + SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup); double mLikelihoods[] = geno.getLikelihoods(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java index 415c3c344..47f1bf8e6 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.ReadBackedPileup; @@ -169,7 +170,7 @@ public class MultiSampleCaller extends LocusWalker indel_allele_likelihoods = new HashMap(); @@ -256,7 +257,7 @@ public class MultiSampleCaller extends LocusWalker -{ - List deep_callers = null; - List shallow_callers = null; - AlleleFrequencyWalker pooled_caller = null; - List sample_names = null; - - @Argument(required=false, shortName="downsample", doc="downsample") public int DOWNSAMPLE = 4; - @Argument(required=false, shortName="downsample_noise", doc="downsample noise") public int DOWNSAMPLE_NOISE = 3; - @Argument(required=false, shortName="log_metrics", doc="log metrics") public boolean LOG_METRICS = true; - @Argument(required=false, shortName="fractional_counts", doc="fractional counts") public boolean FRACTIONAL_COUNTS = false; - - private Random random; - - public void initialize() - { - GenomeAnalysisEngine toolkit = this.getToolkit(); - SAMFileHeader header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - sample_names = new ArrayList(); - deep_callers = new ArrayList(); - shallow_callers = new ArrayList(); - - random = new Random(42); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - //System.out.println("SAMPLE: " + sample_name); - - AlleleFrequencyWalker deep = new AlleleFrequencyWalker(); - AlleleFrequencyWalker shallow = new AlleleFrequencyWalker(); - - deep.N = 2; - deep.DOWNSAMPLE = 0; - deep.GFF_OUTPUT_FILE = "/dev/null"; - deep.initalize(); - - shallow.N = 2; - shallow.DOWNSAMPLE = 0; - shallow.GFF_OUTPUT_FILE = "/dev/null"; - shallow.initalize(); - - sample_names.add(sample_name); - deep_callers.add(deep); - shallow_callers.add(shallow); - } - - pooled_caller = new AlleleFrequencyWalker(); - pooled_caller.N = sample_names.size() * 2; - pooled_caller.DOWNSAMPLE = 0; - pooled_caller.GFF_OUTPUT_FILE = "/dev/null"; - pooled_caller.initalize(); - pooled_caller.reduceInit(); - } - - public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) - { - // 1. seperate each context. - LocusContext[] deep_contexts = new LocusContext[sample_names.size()]; - for (int i = 0; i < sample_names.size(); i++) - { - deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0); - } - - // 2. Pool the deep contexts. (for the hell of it) - LocusContext deep_pooled_context = poolLocusContext(deep_contexts); - - // 3. Call individuals from deep coverage. - AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()]; - for (int i = 0; i < sample_names.size(); i++) - { - deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]); - } - - // 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool. - // 4.1. Count "true" allele frequency from deep calls, and downsample the individuals with solid truth. - double true_alt_freq = 0.0; - double true_N = 0.0; - LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()]; - for (int i = 0; i < deep_calls.length; i++) - { - downsampled_contexts[i] = null; - if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0)) - { - true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N; - true_N += 2; - downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE)); - } - } - true_alt_freq /= true_N; - - if (true_N == 0.0) { return null; } // just bail if true_N == 0. - - // 4.2. Pool just the contexts that have truth data. - LocusContext pooled_context = poolLocusContext(downsampled_contexts); - - - // EM Loop: - AlleleFrequencyEstimate pooled_call = null; - - double correct_shallow_calls = 0; - double total_shallow_calls = 0; - AlleleFrequencyEstimate[] shallow_calls = null; - double EM_alt_freq = 0; - double EM_N = 0; - double shallow_calls_fraction_correct = 0; - - // 5. Call the pool. (this step is the EM init) - pooled_caller.N = (int)true_N; - pooled_call = pooled_caller.map(tracker, ref, pooled_context); - System.out.print("POOLED_CALL " + pooled_call.asTabularString()); - - // (this loop is the EM cycle) - EM_alt_freq = pooled_call.qstar; //pooled_call.qhat; - int num_iterations = 10; - double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq; - double[] likelihood_trajectory = new double[num_iterations + 1]; likelihood_trajectory[0] = pooled_call.pBest; - for (int iterations = 0; iterations < num_iterations; iterations++) - { - // 6. Re-call from shallow coverage using the estimated frequency as a prior, - // and compare to true deep calls, - // and compute new MAF estimate. - correct_shallow_calls = 0; - total_shallow_calls = 0; - shallow_calls = new AlleleFrequencyEstimate[sample_names.size()]; - EM_N = 0.0; - double EM_sum = 0.0; - double likelihood = 0.0; - - for (int i = 0; i < deep_calls.length; i++) - { - // Only shallow-call things where we know the truth! - if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0)) - { - shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq); - shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, downsampled_contexts[i]); - String deep_genotype = deep_calls[i].genotype(); - String shallow_genotype = shallow_calls[i].genotype(); - - likelihood += shallow_calls[i].lodVsNextBest; - - //System.out.printf("DBG: %f %f %f %f\n", - // deep_calls[i].lodVsNextBest, - // deep_calls[i].lodVsRef, - // shallow_calls[i].lodVsNextBest, - // shallow_calls[i].lodVsRef); - - if (deep_genotype.equals(shallow_genotype)) - { - correct_shallow_calls += 1; - } - total_shallow_calls += 1; - - if (! FRACTIONAL_COUNTS) - { - EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N; - EM_N += shallow_calls[i].N; - } - else - { - for (int j = 0; j <= shallow_calls[i].N; j++) - { - if (Double.isInfinite(shallow_calls[i].posteriors[j])) { shallow_calls[i].posteriors[j] = -10000; } - System.out.printf("DBG3: %d %f %d\n", j, shallow_calls[i].posteriors[j], shallow_calls[i].N); - EM_sum += Math.pow(10,shallow_calls[i].posteriors[j]) * (double)j; - EM_N += shallow_calls[i].N; - } - } - - } - } - EM_alt_freq = EM_sum / EM_N; - shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls; - trajectory[iterations+1] = EM_alt_freq; - likelihood_trajectory[iterations+1] = likelihood/(double)total_shallow_calls; - - System.out.printf("DBGTRAJ %f %f %f %f\n", EM_sum, EM_N, trajectory[iterations], trajectory[iterations+1]); - } - - // 7. Compare to estimation from the pool. - System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n", - pooled_call.location, - pooled_call.lodVsRef, - true_alt_freq, - pooled_call.qhat, - pooled_call.qstar, - true_alt_freq * true_N, - pooled_call.emperical_allele_frequency() * true_N, - pooled_call.N, - pooled_call.depth, - total_shallow_calls, - correct_shallow_calls, - shallow_calls_fraction_correct, - EM_alt_freq); - for (int i = 0; i < likelihood_trajectory.length; i++) - { - System.out.printf("TRAJECTORY %f %f\n", trajectory[i], likelihood_trajectory[i]); - } - System.out.print("\n\n"); - - return null; - } - - private LocusContext poolLocusContext(LocusContext[] contexts) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - GenomeLoc location = null; - - for (int i = 0; i < contexts.length; i++) - { - if (contexts[i] != null) - { - location = contexts[i].getLocation(); - reads.addAll(contexts[i].getReads()); - offsets.addAll(contexts[i].getOffsets()); - } - } - - return new LocusContext(location, reads, offsets); - } - - private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample) - { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - String RG = (String)(read.getAttribute("RG")); - String sample = read.getHeader().getReadGroup(RG).getSample(); - if (sample == sample_name) - { - reads.add(read); - offsets.add(offset); - } - } - - if (downsample != 0) - { - List perm = new ArrayList(); - for (int i = 0; i < reads.size(); i++) { perm.add(i); } - perm = Utils.RandomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads.get(perm.get(i))); - downsampled_offsets.add(offsets.get(perm.get(i))); - } - - reads = downsampled_reads; - offsets = downsampled_offsets; - } - - return new LocusContext(context.getLocation(), reads, offsets); - } - - public void onTraversalDone() - { - return; - } - - public String reduceInit() - { - return ""; - } - - public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) - { - return ""; - } - - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintCoverageWalker.java index 315cb0506..21172d499 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PrintCoverageWalker.java @@ -1,28 +1,12 @@ package org.broadinstitute.sting.playground.gatk.walkers; -import net.sf.picard.reference.ReferenceSequence; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; @By(DataSource.REFERENCE) public class PrintCoverageWalker extends LocusWalker { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/ClusteredSNPFilterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/ClusteredSNPFilterWalker.java index bc2af67e0..69938b12f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/ClusteredSNPFilterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/ClusteredSNPFilterWalker.java @@ -7,8 +7,8 @@ import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.io.*; @@ -40,10 +40,10 @@ public class ClusteredSNPFilterWalker extends RefWalker { public void initialize() { try { vwriter = new PrintWriter(VARIANTS_OUT); - vwriter.println(AlleleFrequencyEstimate.geliHeaderString()); + vwriter.println(GeliTextWriter.headerLine); if ( FILTERED_OUT != null ) { fwriter = new PrintWriter(FILTERED_OUT); - fwriter.println(AlleleFrequencyEstimate.geliHeaderString()); + fwriter.println(GeliTextWriter.headerLine); } } catch (FileNotFoundException e) { throw new StingException(String.format("Could not open file for writing")); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java index 81e82ba0a..182f27381 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java @@ -3,8 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.io.FileNotFoundException; @@ -55,7 +55,7 @@ public class VariantFiltrationWalker extends LocusWalker { try { vwriter = new PrintWriter(VARIANTS_OUT_HEAD + ".included.geli.calls"); - vwriter.println(AlleleFrequencyEstimate.geliHeaderString()); + vwriter.println(GeliTextWriter.headerLine); swriters = new HashMap(); @@ -118,7 +118,7 @@ public class VariantFiltrationWalker extends LocusWalker { swriters.get(STUDY_NAME).print(vec.getStudyHeader() + "\t"); PrintWriter writer = new PrintWriter(VARIANTS_OUT_HEAD + ".excluded." + exclusionClassName + ".geli.calls"); - writer.println(AlleleFrequencyEstimate.geliHeaderString()); + writer.println(GeliTextWriter.headerLine); ewriters.put(exclusionClassName, writer); } catch (InstantiationException e) { diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java deleted file mode 100755 index 3220a623b..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ /dev/null @@ -1,210 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Arrays; - -public class AlleleFrequencyEstimate { - - public GenomeLoc location; - public char ref; - public char alt; - public int N; - public double qhat; - public double qstar; - public double lodVsRef; - public double lodVsNextBest; - public double pBest; - public double pRef; - public int depth; - public String notes; - public String bases; - //public double[][] quals; - public double[] posteriors; - public String sample_name; - public int n_ref; - public int n_het; - public int n_hom; - - public GenotypeLikelihoods genotypeLikelihoods = null; - - GenomeLoc l; - - - public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name) - { - this.location = location; - this.ref = ref; - this.alt = alt; - this.N = N; - this.qhat = qhat; - this.qstar = qstar; - this.lodVsRef = lodVsRef; - this.lodVsNextBest = lodVsNextBest; - this.pBest = pBest; - this.pRef = pRef; - this.depth = depth; - this.notes = ""; - this.bases = bases; - //this.quals = quals; - this.posteriors = posteriors; - this.sample_name = sample_name; - } - - public boolean isREF() { return (this.lodVsRef <= -5.0); } - public boolean isSNP() { return (this.lodVsRef >= 5.0); } - - /** Return the most likely genotype. */ - public String genotype() - { - int alt_count = (int)(qstar * N); - int ref_count = N-alt_count; - char[] alleles = new char[N]; - int i; - for (i = 0; i < ref_count; i++) { alleles[i] = ref; } - for (; i < N; i++) { alleles[i] = alt; } - Arrays.sort(alleles); - return new String(alleles); - } - - public double emperical_allele_frequency() - { - return (double)Math.round((double)qstar * (double)N) / (double)N; - } - - public double emperical_allele_frequency(int N) - { - return (double)Math.round((double)qstar * (double)N) / (double)N; - } - - public String asGFFString() - { - String s = ""; - s += String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\t", - location.getContig(), - location.getStart(), - location.getStart(), - lodVsRef); - s += String.format("\t;\tSAMPLE %s", sample_name); - s += String.format("\t;\tREF %c", ref); - s += String.format("\t;\tALT %c", alt); - s += String.format("\t;\tFREQ %f", qstar); - s += String.format("\t;\tGENOTYPE %s", this.genotype()); - s += String.format("\t;\tDEPTH %d", depth); - s += String.format("\t;\tLODvsREF %f", lodVsRef); - s += String.format("\t;\tLODvsNEXTBEST %f", lodVsNextBest); - s += String.format("\t;\tQHAT %f", qhat); - s += String.format("\t;\tQSTAR %f", qstar); - s += String.format("\t;\tBASES %s", bases); - - s += ";\n"; - - // add quals. - - return s; - } - - public static String asTabularStringHeader() - { - return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases"; - } - - public static String geliHeaderString() { - return "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT"; - } - - public String asGeliString() - { - // #Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT - // chr1 7764136 A 48 99 CC 83.650421 9.18159 -92.83638 -18.367548 -96.91729 -96.614204 -9.185958 -23.33643 -23.033337 -101.282059 -101.583092 -101.279999 - - // chr pos ref Nreads maxMapQ genotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT - //public double[] posteriors; - return String.format("%s %16d %c %8d %d %s %.6f %.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f", - location.getContig(), - location.getStart(), - ref, - depth, - -1, - genotype(), - lodVsRef, - lodVsNextBest, - posteriors[0], - posteriors[1], - posteriors[2], - posteriors[3], - posteriors[4], - posteriors[5], - posteriors[6], - posteriors[7], - posteriors[8], - posteriors[9]); - } - - public String asTabularString() - { - return String.format("%s %s %c %c %s %f %f %f %f %d %s", - location, - sample_name, - ref, - alt, - genotype(), - qhat, - qstar, - lodVsRef, - lodVsNextBest, - depth, - bases); - } - - public String toString() { return asTabularString(); } - - public String asString() { - // Print out the called bases - // Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref - // base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according - // to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want - // to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1 - long numNonrefBases = Math.round(qstar * N); - long numRefBases = N - numNonrefBases; - if (ref < alt) { // order bases alphabetically - return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases); - }else{ - return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases); - } - } - - public String asPoolTabularString() - { - return String.format("%s %c %c %f %f %f %s %f %d %d %d %d", - location, - ref, - alt, - qstar, - pBest, - pRef, - "NA", - lodVsRef, - N, - n_ref, - n_het, - n_hom); - } - - - public double posterior() - { - return this.posteriors[(int)this.qstar * this.N]; - } - - public String callType() { - // Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP - - String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"}; - String genotype = genotype(); - int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0); - return callTypeString[ref_matches]; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java deleted file mode 100755 index 464cbf386..000000000 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java +++ /dev/null @@ -1,198 +0,0 @@ -package org.broadinstitute.sting.playground.utils; - -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.refdata.rodGFF; -import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; - -public class AlleleMetrics { - private double LOD_cutoff = 5; - - private long dbsnp_hits = 0; - private long num_variants = 0; - private long num_loci_total = 0; - private long num_loci_confident = 0; - private long hapmap_genotype_correct = 0; - private long hapmap_genotype_incorrect = 0; - private long hapmap_refvar_correct = 0; - private long hapmap_refvar_incorrect = 0; - - private final double dbl_cmp_precision = 0.0001; - - protected PrintStream out; - - public AlleleMetrics(String metricsOutputPath) { - initialize(new File(metricsOutputPath), LOD_cutoff); - } - - public AlleleMetrics(String metricsOutputPath, double lodThreshold) { - initialize(new File(metricsOutputPath), lodThreshold); - } - - public AlleleMetrics(File metricsOutputFile) { - initialize(metricsOutputFile, LOD_cutoff); - } - - public AlleleMetrics(File metricsOutputFile, double lodThreshold) { - initialize(metricsOutputFile, lodThreshold); - } - - private void initialize(File metricsOutputFile, double lodThresold) { - try { - this.out = new PrintStream(metricsOutputFile); - } catch (FileNotFoundException e) { - System.err.format("Unable to open file '%s'. Perhaps the parent directory does not exist or is read-only.", metricsOutputFile.getAbsolutePath()); - System.exit(-1); - } - - this.LOD_cutoff = lodThresold; - } - - public void nextPosition(GenotypeCall cl, RefMetaDataTracker tracker) { - SSGGenotypeCall call = (SSGGenotypeCall)cl; - num_loci_total += 1; - - boolean is_dbSNP_SNP = false; - boolean has_hapmap_chip_genotype = false; - rodGFF hapmap_chip_genotype = null; - - for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) - { - if ( datum != null ) - { - if ( datum instanceof rodDbSNP) - { - rodDbSNP dbsnp = (rodDbSNP)datum; - if (dbsnp.isSNP()) is_dbSNP_SNP = true; - } - - if ( datum instanceof rodGFF ) - { - has_hapmap_chip_genotype = true; - hapmap_chip_genotype = (rodGFF)datum; - } - } - } - double result = call.getBestRef(); - if (Math.abs(call.getBestNext()) >= LOD_cutoff) { num_loci_confident += 1; } - - if (call.isVariation() && result >= LOD_cutoff) - { - // Confident variant. - - num_variants += 1; - - if (is_dbSNP_SNP) - { - dbsnp_hits += 1; - } - } - - if (has_hapmap_chip_genotype) { - - // convert hapmap call to mixture of ref/nonref - String hapmap_genotype = hapmap_chip_genotype.getFeature(); - long refs=0, alts=0; - double hapmap_q; - String str = call.getBases(); - char alt = str.charAt(0); - if (str.charAt(0) == call.getReferencebase()) alt = str.charAt(1); - for (char c : hapmap_genotype.toCharArray()) { - if (c == call.getReferencebase()) { refs++; } - if (c == alt) { alts++; } - } - - if (refs+alts > 0) { - hapmap_q = ((double) alts) / ((double) (refs+alts)); - }else{ - hapmap_q = -1.0; - } - - // Hapmap debug info - //out.format("HAPMAP DEBUG %.2f %.2f %.2f ", hapmap_q, alleleFreq.qstar, alleleFreq.lodVsRef); - //String called_genotype = alleleFreq.asString(); - //out.format("%s %s %c %c", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt); - - //System.out.printf("DBG %f %s\n", LOD_cutoff, alleleFreq.asTabularString()); - if (call.getBestNext() >= LOD_cutoff) { - // TODO : should this all be commented out? - /* - System.out.printf("DBG %f %f %f %f\n", - hapmap_q, - alleleFreq.qhat, - alleleFreq.qstar, - alleleFreq.lodVsNextBest); - */ - - // Calculate genotyping performance - did we get the correct genotype of the N+1 choices? - //if (hapmap_q != -1 && hapmap_q == alleleFreq.qstar) { - /*if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && Math.abs(hapmap_q - alleleFreq.qstar) <= dbl_cmp_precision) { - hapmap_genotype_correct++; - }else{ - hapmap_genotype_incorrect++; - //System.out.printf(" INCORRECT GENOTYPE Bases: %s", AlleleFrequencyWalker.getBases(context)); - //out.printf(" INCORRECT GENOTYPE"); - //AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context)); - }*/ - } - - if (result >= LOD_cutoff || -1 * result >= LOD_cutoff) { - - // Now calculate ref / var performance - did we correctly classify the site as - // reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here - boolean hapmap_var = hapmap_q != 0.0; - boolean called_var = call.isVariation(); - //if (hapmap_q != -1 && hapmap_var != called_var) { - if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && hapmap_var != called_var) { - hapmap_refvar_incorrect++; - //out.printf(" INCORRECT REFVAR CALL"); - }else{ - hapmap_refvar_correct++; - } - } - - //out.print("\n"); - } - } - - public void printMetrics() - { - if (num_loci_total == 0) { return; } - - out.printf("\n"); - out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff); - out.printf("-------------------------------------------------\n"); - out.printf("Total loci : %d\n", num_loci_total); - out.printf("Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); - if (num_variants != 0) - { - out.printf("Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); - out.printf("Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); - out.printf("-------------------------------------------------\n"); - out.printf("-- Hapmap Genotyping performance --\n"); - out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect); - out.printf("Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct); - out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect); - out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect)); - out.printf("-------------------------------------------------\n"); - out.printf("-- Hapmap Reference/Variant performance --\n"); - out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect); - out.printf("Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct); - out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect); - out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect)); - } - out.println(); - } - - public void printMetricsAtLocusIntervals(int loci_interval) { - if (num_loci_total % loci_interval == 0) printMetrics(); - } - - -} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java b/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java index 4a6729e97..149ad8843 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java @@ -154,7 +154,7 @@ public class BasicGenotype implements Genotype { * @return */ @Override - public org.broadinstitute.sting.utils.genotype.variant.Variant toVariant() { + public Variant toVariant() { return null; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java index f9bed48de..4847d34d7 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore; -import org.broadinstitute.sting.utils.genotype.variant.Variant; +import org.broadinstitute.sting.utils.genotype.Variant; /** * @author aaron diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java index 23ba60427..9e087c77c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.utils.genotype.calls; +package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotype.Genotype; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeOutput.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeOutput.java index 623c5b831..298254540 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeOutput.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeOutput.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.utils.genotype; -import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall; - /** * @author aaron *

diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Variant.java b/java/src/org/broadinstitute/sting/utils/genotype/Variant.java index 78bb3d29c..4a642b1b2 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Variant.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Variant.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.utils.genotype.variant; +package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore; @@ -21,7 +21,7 @@ public interface Variant { * * @return VariantFrequency with the stored frequency */ - public VariantFrequency getFrequency(); + public double getNonRefAlleleFrequency(); /** * get the confidence score for this variance diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java index b36aa0739..88aa40f47 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java @@ -7,8 +7,8 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.IndelLikelihood; import org.broadinstitute.sting.utils.genotype.LikelihoodObject; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; import java.io.File; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java index 223ea7831..eee1d02fe 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java @@ -3,7 +3,7 @@ package org.broadinstitute.sting.utils.genotype.geli; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.GenotypeOutput; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; +import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; import java.io.File; import java.io.FileNotFoundException; @@ -32,9 +32,11 @@ public class GeliTextWriter implements GenotypeWriter { } catch (FileNotFoundException e) { throw new StingException("Unable to open file " + file.toURI()); } - mWriter.println("#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT"); + mWriter.println(headerLine); } + public static String headerLine = "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT"; + /** * Add a genotype, given a genotype locus * diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index fc656649e..d1f40f9ab 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -8,7 +8,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.IndelLikelihood; import org.broadinstitute.sting.utils.genotype.LikelihoodObject; -import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall; +import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; import java.io.DataOutputStream; import java.io.File;