Reorganization of the genotyping system

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1370 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-08-03 20:55:31 +00:00
parent 9f1d3aed26
commit 3485397483
25 changed files with 41 additions and 1916 deletions

View File

@ -1,15 +1,11 @@
package org.broadinstitute.sting.playground.utils;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeGenerator;
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
import static java.lang.Math.log10;
@ -18,7 +14,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
public class GenotypeLikelihoods implements GenotypeGenerator {
public class GenotypeLikelihoods {
// precalculate these for performance (pow/log10 is expensive!)
/**
@ -336,16 +332,6 @@ public class GenotypeLikelihoods implements GenotypeGenerator {
return this.ref_likelihood;
}
private IndelLikelihood indel_likelihood;
public void addIndelLikelihood(IndelLikelihood indel_likelihood) {
this.indel_likelihood = indel_likelihood;
}
public IndelLikelihood getIndelLikelihood() {
return this.indel_likelihood;
}
/**
* given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs
*
@ -355,8 +341,7 @@ public class GenotypeLikelihoods implements GenotypeGenerator {
*
* @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values
*/
@Override
public GenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag
// for calculating the rms of the mapping qualities

View File

@ -1,17 +1,13 @@
package org.broadinstitute.sting.utils.genotype.calls;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
import org.broadinstitute.sting.utils.genotype.LexigraphicalComparator;
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
import org.broadinstitute.sting.utils.genotype.variant.Variant;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;

View File

@ -1,22 +1,16 @@
package org.broadinstitute.sting.playground.gatk.walkers;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import java.io.File;
@ -108,13 +102,11 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
}
/**
* Compute the AlleleFrequencyEstimate at a given locus.
* Compute the genotype call at a given locus.
*
* @param tracker the meta data tracker
* @param ref the reference base
* @param context contextual information around the locus
*
* @return an SSGGenotypeCall object for the locus
*/
public SSGGenotypeCall map(RefMetaDataTracker tracker, char ref, LocusContext context) {
SSGGenotypeCall oldAndBusted = mapOldAndBusted(tracker, ref, context);

View File

@ -1,915 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
//import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
// Locus walker that estimates per-locus allele frequencies from read pileups
// and writes confident variant calls (and coalesced confident-reference
// intervals) in GFF format.
public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate, String>// implements AllelicVariant
{
    @Argument(doc="Number of chromosomes in data") public int N;
    @Argument(required=false,doc="downsample") public int DOWNSAMPLE = 0;
    @Argument(doc="File to output GFF formatted allele frequency calls") public String GFF_OUTPUT_FILE;
    @Argument(shortName="met", doc="Turns on logging of metrics on the fly with AlleleFrequency calculation") public boolean LOG_METRICS;
    @Argument(required=false, doc="Name of file where metrics will output") public String METRICS_OUTPUT_FILE = "metrics.out";
    @Argument(required=false, doc="Ignores 4-base probabilities and only uses the quality of the best/called base") public boolean FORCE_1BASE_PROBS;
    protected static Logger logger = Logger.getLogger(AlleleFrequencyWalker.class);
    Random random;                       // seeded deterministically in initalize()
    PrintStream output;                  // GFF output stream; "-" means the walker's stdout
    private boolean initalized = false;  // guards one-time lazy init (name "initalized" is sic)
    static private double quality_precision = 1e-4;  // floor applied to base probabilities to avoid zeros
// Lazy one-time setup: seeds the RNG deterministically (so downsampling in
// map() is reproducible) and opens the GFF output stream ("-" routes to the
// walker's standard out). Safe to call on every map() invocation.
// NOTE(review): "initalize" is a typo for "initialize", but it is this
// method's public name, so it is left unchanged.
public void initalize()
{
    if (initalized) { return; }
    try
    {
        // Fixed seed 0 keeps downsampling deterministic across runs.
        this.random = new java.util.Random(0);
        if ( GFF_OUTPUT_FILE.equals("-") )
            this.output = out;
        else
            this.output = new PrintStream(GFF_OUTPUT_FILE);
    }
    catch (Exception e)
    {
        // Failure to open the output file is fatal to the walk.
        e.printStackTrace();
        System.exit(-1);
    }
    initalized = true;
}
// This walker needs read data at every locus, not just reference/ROD context.
public boolean requiresReads() { return true; }
/**
 * Per-locus map: builds a pileup, optionally downsamples it to DOWNSAMPLE
 * reads, tallies base counts, picks the most frequent non-reference base as
 * the alternate allele, and runs the allele-frequency estimator.
 *
 * @param tracker reference-ordered data at this locus (used to annotate dbSNP hits)
 * @param ref     the reference base
 * @param context reads and offsets overlapping the locus
 * @return the AlleleFrequencyEstimate for this locus
 */
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
{
    // init in the map because GATK doesn't appear to be initing me today. no problemo.
    this.initalize();
    // Convert context data into bases and 4-base quals
    Pileup pileup = new ReadBackedPileup(ref, context);
    String bases = pileup.getBases();
    double quals[][] = getQuals(context);
    //String[] indels = getIndels(context);
    logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE));
    if ((DOWNSAMPLE != 0) && (DOWNSAMPLE < bases.length()))
    {
        // Sample DOWNSAMPLE bases without replacement (rejection sampling on the
        // picked_bases mask), keeping each base paired with its quals row.
        String downsampled_bases = "";
        double downsampled_quals[][] = new double[DOWNSAMPLE][4];
        int picked_bases[] = new int[bases.length()];
        for (int i = 0; i < picked_bases.length; i++) { picked_bases[i] = 0; }
        while (downsampled_bases.length() < DOWNSAMPLE)
        {
            int choice;
            /*
            System.out.printf("DBG %b %b %b\n",
            random == null,
            bases == null,
            picked_bases == null);
            */
            // Re-draw until an unpicked index is found.
            for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length()));
            picked_bases[choice] = 1;
            downsampled_bases += bases.charAt(choice);
            downsampled_quals[downsampled_bases.length()-1] = quals[choice];
        }
        //System.out.printf("From: %s\n", bases);
        //System.out.printf("To : %s\n", downsampled_bases);
        bases = downsampled_bases;
        quals = downsampled_quals;
    }
    // Count bases
    int[] base_counts = new int[4];
    for (byte b : bases.getBytes())
        base_counts[nuc2num[b]]++;
    // Find alternate allele - 2nd most frequent non-ref allele
    // (maybe we should check for ties and eval both or check most common including quality scores)
    int altnum = -1; // start with first base numerical identity (0,1,2,3)
    double altcount = -1; // start with first base count
    int refnum = nuc2num[ref];
    for (int b=0; b<4; b++) {
        if ((b != refnum) && (base_counts[b] > altcount)) {
            altnum = b; altcount = base_counts[b];
        }
    }
    assert(altnum != -1);
    AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
    alleleFreq.notes = String.format("A:%d C:%d G:%d T:%d",
        base_counts[nuc2num['A']],
        base_counts[nuc2num['C']],
        base_counts[nuc2num['G']],
        base_counts[nuc2num['T']]);
    // Print dbSNP data if its there
    if (true) {
        for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
            if ( datum != null && datum instanceof rodDbSNP) {
                rodDbSNP dbsnp = (rodDbSNP)datum;
                //System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD()));
                alleleFreq.notes += String.format(" ROD: %s ",dbsnp.toString());
            }
        }
    }
    logger.debug(String.format(" => result is %s", alleleFreq));
    //if (LOG_METRICS) metrics.nextPosition(alleleFreq, tracker);
    return alleleFreq;
}
/*
static public String[] getIndels(LocusContext context)
{
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
String[] indels = new String[reads.size()];
for (int i = 0; i < reads.size(); i++)
{
SAMRecord read = reads.get(i);
Cigar cigar = read.getCigar();
int k = 0;
for (int j = 0; j < cigar.numCigarElements(); j++)
{
CigarOperator operator = cigar.getCigarElement(j).getOperator();
int length = cigar.getCigarElement(j).getLength();
if (operator == CigarOperator.M)
{
k += length;
}
else if ((k != offset) && (operator == CigarOperator.I))
{
k += length;
}
else if ((k == offset) && (operator == CigarOperator.I))
{
// this insertion is associated with this offset.
break;
}
else if ((k == offset) && (operator == CigarOperator.D))
{
break;
}
else if ((k == offset) &&
((operator == CigarOperator.I) || (operator == CigarOperator.D)))
{
// no indel here.
indels[i] = "";
break;
}
}
}
return indels;
}
*/
/**
 * Build the per-read four-base probability matrix quals[read][baseIndex].
 *
 * The called base receives 1 - 10^(-Q/10) from its base quality. For the
 * other three bases: if the read has no "SQ" attribute (or FORCE_1BASE_PROBS
 * is set) the leftover probability mass is split equally among them; otherwise
 * the compressed "SQ" value supplies the second-best base and its probability,
 * and the remaining mass is split between the last two bases.
 *
 * @param context reads and offsets at the locus
 * @return numReads x 4 matrix of base probabilities
 */
public double[][] getQuals (LocusContext context)
{
    int numReads = context.getReads().size(); //numReads();
    double[][] quals = new double[numReads][4];
    List<SAMRecord> reads = context.getReads();
    List<Integer> offsets = context.getOffsets();
    for (int i =0; i < numReads; i++ ) {
        SAMRecord read = reads.get(i);
        int offset = offsets.get(i);
        // First, set the called base to the correct quality
        int callednum = nuc2num[read.getReadBases()[offset]]; // number of the called base (0,1,2,3)
        quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
        // For other 3 bases, check if 2-base probs exist
        assert (reads.size() > 0);
        Object SQ_field = reads.get(i).getAttribute("SQ");
        if (SQ_field == null || FORCE_1BASE_PROBS) {
            // Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
            double nonref_quals = (1.0 - quals[i][callednum]) / 3;
            for (int b=0; b<4; b++)
            {
                if (b != callednum)
                {
                    quals[i][b] = nonref_quals;
                    if (quals[i][b] <= quality_precision) { quals[i][b] = quality_precision; } // avoid zero probs.
                }
            }
        }else{
            assert (SQ_field instanceof byte[]);
            byte[] hex_quals = (byte[]) SQ_field;
            //System.out.printf("SQ field (hex): %s\n", bytesToHexString(hex_quals));
            //System.out.printf("SAM record: %s\n", read.format());
            byte hex_qual = hex_quals[offset];
            int called2num = QualityUtils.compressedQualityToBaseIndex(hex_qual);
            /*
            int crossTalkPartnerIndex = BaseUtils.crossTalkPartnerIndex(callednum);
            if (called2num == crossTalkPartnerIndex) {
            double nonref_quals = (1.0 - quals[i][callednum]) / 3.0;
            for (int b=0; b<4; b++)
            if (b != callednum)
            quals[i][b] = nonref_quals;
            } else {
            */
            double qual2 = QualityUtils.compressedQualityToProb(hex_qual);
            //System.out.printf("2ND %x %d %f\n", hex_qual, called2num, qual2);
            quals[i][called2num] = qual2;
            if (quals[i][called2num] <= quality_precision) { quals[i][called2num] = quality_precision; } // avoid zero probs.
            // NOTE(review): if called2num == callednum the called base's entry is
            // overwritten by qual2 and the remaining mass may exceed what was
            // intended — confirm SQ never encodes the called base itself.
            double nonref_quals = (1.0 - quals[i][callednum] - quals[i][called2num]) / 2.0;
            for (int b=0; b<4; b++)
                if (b != callednum && b != called2num)
                    quals[i][b] = nonref_quals;
            //}
        }
    }
    //print_base_qual_matrix(quals);
    return quals;
}
// Code pulled from SAMUtils for debugging
// Render a byte array as uppercase hex, two digits per byte.
// (Debugging helper originally pulled from SAMUtils.)
static String bytesToHexString(final byte[] data) {
    final StringBuilder hex = new StringBuilder(2 * data.length);
    for (final byte b : data) {
        hex.append(toHexDigit((b >> 4) & 0xF));
        hex.append(toHexDigit(b & 0xF));
    }
    return hex.toString();
}
// Map a nibble value 0-15 to its uppercase hex character ('0'-'9', 'A'-'F').
private static char toHexDigit(final int value) {
    if (value < 10) {
        return (char) ('0' + value);
    }
    return (char) ('A' + value - 10);
}
/**
 * Core estimator: scores every hypothesized true nonref chromosome count
 * qstar_N in 0..N. Each hypothesis's log10 posterior is
 * P(D|q) + P(q|G) + P(G), where q is the grid-searched ML nonref fraction.
 * Hypotheses are ranked to produce LOD scores (best vs. reference and best
 * vs. next best) and packed into an AlleleFrequencyEstimate.
 *
 * @param location locus being scored (may be null in tests)
 * @param N        number of chromosomes
 * @param bases    observed bases at the locus
 * @param quals    per-read 4-base probability matrix (see getQuals)
 * @param refnum   reference base index (0..3)
 * @param altnum   alternate base index (0..3)
 * @param depth    reported read depth
 * @return the populated AlleleFrequencyEstimate
 */
public AlleleFrequencyEstimate AlleleFrequencyEstimator(GenomeLoc location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{
    // q = hypothetical %nonref
    // qstar = true %nonref
    // G = N, qstar, alt
    // N = number of chromosomes
    // alt = alternate hypothesis base (most frequent nonref base)
    // ref = reference base
    double qstar;
    int qstar_N;
    double qstep = 0.001;
    double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
    double posterior_null_hyp;
    // Initialize empyt MixtureLikelihood holders for each possible qstar (N+1)
    MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1];
    double posteriors[] = new double[N+1];
    {
        double q = ML_q_byEM(bases, quals, refnum, altnum);
        double pDq = P_D_q(bases, quals, q, refnum, altnum);
        long q_R = Math.round(q*bases.length);
        // Null hypothesis: qstar = 0, i.e. all chromosomes are reference.
        posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0, 0) + P_G(N, 0);
        bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
        posteriors[0] = posterior_null_hyp;
        //System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, 0.0, 0.0, P_D_q(bases, quals, 0.0, refnum, altnum), P_q_G(bases, N, 0.0, 0, 0), P_G(N, 0), prior_alt_frequency, posterior_null_hyp, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
        assert(! Double.isNaN(P_D_q(bases, quals, 0.0, refnum, altnum)));
        assert(! Double.isNaN(P_q_G(bases, N, 0.0, 0, 0)));
        assert(! Double.isNaN(P_G(N, 0)));
        assert(! Double.isNaN(posteriors[0]));
        assert(! Double.isInfinite(posteriors[0]));
        // qstar - true allele balances
        //for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
        for (qstar_N = 1; qstar_N <= N; qstar_N += 1)
        {
            qstar = (double)qstar_N / (double)N;
            double pqG = P_q_G(bases, N, q, qstar, q_R);
            double pG = P_G(N, qstar_N);
            double posterior = pDq + pqG + pG;
            bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
            posteriors[qstar_N] = posterior;
            //System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n",
            // location, q, qstar, pDq, pqG, pG, prior_alt_frequency, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
        }
    }
    // First reverse sort mixtures according to highest posterior probabililty
    Arrays.sort(bestMixtures);
    // Calculate Lod of any variant call versus the reference call
    // Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis
    // reference mixture - qhat = qstar = 0.0
    // NOTE(review): after sorting, bestMixtures[0] may BE the null-hypothesis
    // mixture, making this difference 0 — handled by the lodVarVsRef == 0
    // fallback below.
    double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp;
    // Now reverse sort ALL mixtures according to highest posterior probability
    // NOTE(review): the array is already sorted; this second sort is redundant.
    Arrays.sort(bestMixtures);
    // Calculate Lod of the mixture versus other possible
    // Answers how confident are we in the best mixture versus the nextPosition best mixture
    double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
    if (lodVarVsRef == 0) { lodVarVsRef = -1.0 * lodBestVsNextBest; }
    AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
        num2nuc[refnum],
        num2nuc[altnum],
        N,
        bestMixtures[0].qhat,
        bestMixtures[0].qstar,
        lodVarVsRef,
        lodBestVsNextBest,
        bestMixtures[0].posterior,
        posterior_null_hyp,
        depth,
        new String(bases),
        quals,
        posteriors,
        "unknown_sample");
    return alleleFreq;
}
// Holder for the log10 posterior of one (qhat, qstar) mixture hypothesis.
// compareTo sorts in DESCENDING posterior order, so Arrays.sort puts the best
// mixture first.
// NOTE(review): non-static inner class (carries a hidden reference to the
// enclosing walker); compareTo is inconsistent-with-equals (neither equals nor
// hashCode is overridden), and NaN posteriors compare as equal. Left as-is to
// preserve behavior.
public class MixtureLikelihood implements Comparable<MixtureLikelihood> {
    public double posterior;  // log10 posterior probability of this mixture
    public double qstar;      // hypothesized true nonref fraction (qstar_N / N)
    public double qhat;       // maximum-likelihood nonref fraction estimate
    MixtureLikelihood() {
        // Math.log10(0) == -Infinity: default-constructed mixtures are "impossible".
        this.posterior = Math.log10(0);
        this.qstar = Math.log10(0);
        this.qhat = Math.log10(0);
    }
    MixtureLikelihood(double posterior, double qstar, double qhat) {
        this.posterior = posterior;
        this.qstar = qstar;
        this.qhat = qhat;
    }
    public int compareTo (MixtureLikelihood other) {
        // default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element
        if (posterior < other.posterior) return 1;
        else if (posterior > other.posterior) return -1;
        else return 0;
    }
}
// Naive ML estimate of the nonref fraction: alt count over (ref + alt) counts.
// Bases matching neither ref nor alt are ignored; quals is accepted but unused.
double ML_q(byte[] bases, double[][] quals, int refnum, int altnum)
{
    double refTotal = 0;
    double altTotal = 0;
    for (final byte base : bases)
    {
        if (base == num2nuc[refnum]) { refTotal += 1; }
        if (base == num2nuc[altnum]) { altTotal += 1; }
    }
    return altTotal / (altTotal + refTotal);
}
// Grid search (step 0.001 over [0, 1]) for the nonref fraction q that
// maximizes P(D|q). Ties resolve in favor of the larger q (>= comparison).
double ML_q_byEM(byte[] bases, double[][] quals, int refnum, int altnum)
{
    double argmax = 0;
    double maxLikelihood = -1000000;
    for (double q = 0.0; q <= 1.0; q += 0.001)
    {
        final double likelihood = P_D_q(bases, quals, q, refnum, altnum);
        if (likelihood >= maxLikelihood)
        {
            maxLikelihood = likelihood;
            argmax = q;
        }
    }
    return argmax;
}
// log10 P(D | q): sum over reads of log10 of the two-component mixture
// (1-q) * P(base | ref) + q * P(base | alt), floored at epsilon so that
// log10 never sees a non-positive argument.
double P_D_q(byte[] bases, double[][] quals, double q, int refnum, int altnum)
{
    final double epsilon = 1e-4;
    double logLikelihood = 0.0;
    for (int i = 0; i < bases.length; i++)
    {
        double term = (1 - q) * quals[i][refnum] + q * quals[i][altnum];
        if (term <= 0) { term = epsilon; }
        if (Double.isNaN(Math.log10(term))) { System.out.printf("DBG: %f %f %f\n", q, quals[i][refnum], quals[i][altnum]); }
        logLikelihood += Math.log10(term);
    }
    return logLikelihood;
}
// log10 P(q | G): likelihood of observing ML mixture q given true mixture
// qstar, modeled as a binomial over the q_R nonref reads. Only implemented for
// the diploid case (N == 2); otherwise contributes 0 (a log-space no-op).
// qstar is nudged off the 0/1 boundaries to keep the binomial finite.
double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
{
    final double epsilon = 1e-3;
    if (qstar == 0) { qstar = epsilon; }
    if (qstar == 1) { qstar = 1.0 - epsilon; }
    if (N != 2) { return 0.0; }
    return binomialProb(q_R, bases.length, qstar);
}
// log10 prior P(G) on the true nonref chromosome count qstar_N.
// If a prior alt allele frequency was supplied (diploid only; -1 means none),
// use Hardy-Weinberg genotype priors: (1-f)^2, 2f(1-f), f^2. Otherwise fall
// back to fixed priors: hom-ref 0.999, hom-nonref 1e-5, anything else 1e-3.
double P_G(int N, int qstar_N)
{
    // badly hard coded right now.
    if ((N == 2) && (prior_alt_frequency != -1))
    {
        double p = -1.0;
        if (qstar_N == 0) { p = Math.pow(1.0 - prior_alt_frequency, 2.0); }
        else if (qstar_N == 1) { p = 2.0 * (prior_alt_frequency * (1.0 - prior_alt_frequency)); }
        else if (qstar_N == 2) { p = Math.pow(prior_alt_frequency, 2.0); }
        else { assert(false); }  // diploid qstar_N must be 0, 1 or 2
        //System.out.printf("DBG2: %d %d %f %f\n", N, qstar_N, prior_alt_frequency, p);
        return Math.log10(p);
    }
    if (qstar_N == 0) { return Math.log10(0.999); }
    else if (qstar_N == N) { return Math.log10(1e-5); }
    else { return Math.log10(1e-3); }
}
// Human-readable genotype class for a call: the rounded nonref chromosome
// count (q * N) maps 0 -> "ref", 1 -> "het", 2 -> "hom"; any other count
// yields "ERROR" (counts above 2 are not classified).
static String genotypeTypeString(double q, int N){
    final int nonrefChromosomes = Math.round((float) q * N);
    if (nonrefChromosomes == 0) {
        return "ref";
    }
    if (nonrefChromosomes == 1) {
        return "het";
    }
    if (nonrefChromosomes == 2) {
        return "hom";
    }
    return "ERROR";
}
/**
 * log10 of the binomial probability P(X = k) for n Bernoulli trials with
 * success probability p.
 *
 * For n < 1000 the value is computed directly from nchoosek; for larger n a
 * Gaussian approximation to the binomial is returned instead.
 * NOTE(review): the direct branch uses log10(pow(p, k)), which underflows to
 * -Infinity for extreme k*log10(p). In the large-n branch, ans_2/check/
 * residual are computed only for debugging — check calls nchoosek with huge n
 * (overflowing a double) and none of them affect the returned ans_1.
 */
static double binomialProb(long k, long n, double p) {
    // k - number of successes
    // n - number of Bernoulli trials
    // p - probability of success
    double ans;
    //if (((double)n*p < 5.0) && ((double)n*(1.0-p) < 5.0))
    if (n < 1000)
    {
        // For small n and the edges, compute it directly.
        ans = Math.log10((double)nchoosek(n, k)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
        //System.out.printf("DBG1: %d %d %f %f %f %f %f\n",
        // k, n, p,
        // nchoosek(n,k), Math.pow(p,k), Math.pow(1-p, n-k),
        // ans);
    }
    else
    {
        // For large n, approximate with a gaussian.
        double mean = (double)(n*p);
        double var = Math.sqrt((double)(n*p)*(1.0-p));
        double ans_1 = Math.log10((double)(1.0 / (var*Math.sqrt(2*Math.PI)))*Math.exp(-1.0 * Math.pow((double)k-mean,2)/(2.0*var*var)));
        double ans_2 = ((Utils.logGamma(n+1) - Utils.logGamma(k+1) - Utils.logGamma(n-k+1))/Math.log(10)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
        double check = Math.log10((double)nchoosek(n, k)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
        double residual = ans_2 - check;
        //System.out.format("DBG: %d %.10f %.10f\n", nchoosek(n, k), Math.pow(p, k), Math.pow(1-p, n-k));
        //System.out.format("RESIDUAL: (%d,%d,%f) %.10f %.10f %.10f\n", k, n, p, ans, check, residual);
        //System.out.printf("DBG2: %d %d %f %f %f %f %f\n", k, n, p, check, ans_1, ans_2, residual);
        //if ((Double.isInfinite(check) || (Double.isNaN(check))))
        //{
        // System.out.printf("DBG2: %d %d %f %f\n", k, n, p, check);
        //}
        ans = ans_1;
    }
    //System.out.printf("DBG3: %d %d %f %f\n", n, k, p, ans);
    return ans;
}
/**
 * Binomial coefficient C(n, k) computed in floating point.
 *
 * Uses the multiplicative formula with the symmetry C(n, k) == C(n, n-k) to
 * halve the iteration count. Exact for results representable in a double
 * (up to ~2^53); larger values lose precision, which the log-space callers
 * (binomialProb) tolerate.
 *
 * @param n number of items (n >= 0 expected)
 * @param k number chosen; any out-of-range k (k < 0 or k > n) yields 0
 * @return C(n, k) as a double
 */
static double nchoosek(long n, long k)
{
    // BUG FIX: negative k previously skipped both guards and the loop,
    // incorrectly returning 1; C(n, k<0) is 0 by definition.
    if (k < 0 || k > n)
        return 0;
    if (k > n/2)
        k = n-k; // Take advantage of symmetry
    double accum = 1;
    for (long i = 1; i <= k; i++)
        accum = accum * (n-k+i) / i;
    return accum;
}
/**
 * Repeat a character count times.
 *
 * @param letter character to repeat
 * @param count  number of repetitions; zero or negative yields ""
 * @return the repeated string
 */
public static String repeat(char letter, long count) {
    // StringBuilder avoids the O(count^2) cost of repeated String
    // concatenation; a long loop index also avoids the original int
    // counter silently overflowing for count > Integer.MAX_VALUE.
    final StringBuilder result = new StringBuilder((int) Math.max(0, Math.min(count, Integer.MAX_VALUE)));
    for (long i = 0; i < count; i++) {
        result.append(letter);
    }
    return result.toString();
}
// State for coalescing consecutive confident-reference loci into a single GFF
// REFERENCE interval line (maintained by reduce, flushed by onTraversalDone).
private String confident_ref_interval_contig = "";
private long confident_ref_interval_start = 0;
private double confident_ref_interval_LOD_sum = 0;   // summed lodVsRef over the open interval
private double confident_ref_interval_length = 0;    // number of loci in the open interval
private long last_position_considered = -1;          // detects positional gaps between loci
private boolean inside_confident_ref_interval = false;
AlleleMetrics metrics;  // optional on-the-fly metrics logger (enabled by LOG_METRICS)
// Reset all confident-reference interval bookkeeping before a traversal and,
// when metrics logging is enabled, open the metrics output file.
// Returns the initial (empty-string) reduce accumulator.
public String reduceInit()
{
    inside_confident_ref_interval = false;
    confident_ref_interval_contig = "";
    confident_ref_interval_start = 0;
    confident_ref_interval_LOD_sum = 0;
    confident_ref_interval_length = 0;
    last_position_considered = -1;
    if (LOG_METRICS) metrics = new AlleleMetrics(METRICS_OUTPUT_FILE);
    return "";
}
/**
 * Per-locus reduce: prints GFF lines for confident variant calls
 * (lodVsRef >= 5) and coalesces runs of confident reference loci
 * (lodVsRef <= -5.0, contiguous positions) into single REFERENCE interval
 * lines scored by their mean lodVsRef.
 *
 * @param alleleFreq the map result for this locus
 * @param sum        accumulator (unused; always "")
 * @return the accumulator unchanged
 */
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
    // Print RESULT data for confident calls
    long current_offset = alleleFreq.location.getStart(); //Integer.parseInt(tokens[1]);
    if (inside_confident_ref_interval &&
        ((alleleFreq.lodVsRef > -5.0) || (current_offset != last_position_considered + 1)) )
    {
        // No longer hom-ref, so output a ref line.
        // NOTE(review): when the interval closes because of a position gap while
        // the current locus is still confidently ref, no new interval is opened
        // for the current locus — it is dropped from interval accounting.
        // Confirm this is intended.
        double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
        output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
            confident_ref_interval_contig,
            confident_ref_interval_start,
            last_position_considered,
            lod,
            (int)(confident_ref_interval_length));
        inside_confident_ref_interval = false;
    }
    else if (inside_confident_ref_interval && (alleleFreq.lodVsRef <= -5.0))
    {
        // Still hom-ref so increment the counters.
        confident_ref_interval_LOD_sum += alleleFreq.lodVsRef;
        confident_ref_interval_length += 1;
    }
    else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef > -5.0))
    {
        // do nothing.
    }
    else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef <= -5.0))
    {
        // We moved into a hom-ref region so start a new interval.
        confident_ref_interval_contig = alleleFreq.location.getContig();
        confident_ref_interval_start = alleleFreq.location.getStart();
        confident_ref_interval_LOD_sum = alleleFreq.lodVsRef;
        confident_ref_interval_length = 1;
        inside_confident_ref_interval = true;
    }
    last_position_considered = current_offset;
    // Confident variant call: emit it as GFF.
    if (alleleFreq.lodVsRef >= 5) {
        this.output.print(alleleFreq.asGFFString());
        /*
        String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N);
        System.out.print("DEBUG " + gtype + " ");
        if (gtype.contentEquals("het")) {
        System.out.println(alleleFreq.ref + "" + alleleFreq.alt);
        } else if (gtype.contentEquals("hom")) {
        System.out.println(alleleFreq.ref + "" + alleleFreq.ref);
        } else {
        System.out.println(alleleFreq.alt + "" + alleleFreq.alt);
        }
        */
    }
    if (LOG_METRICS) metrics.printMetricsAtLocusIntervals(1000);
    return "";
}
// Prior alternate-allele frequency (e.g. from dbSNP); -1 means "no prior",
// which makes P_G fall back to its hard-coded genotype priors.
private double prior_alt_frequency = -1.0;
// Install a prior alt allele frequency for P_G. A zero frequency is clamped
// to 1e-5 so the Hardy-Weinberg log-priors in P_G stay finite.
public void setAlleleFrequencyPrior(double frequency)
{
    assert(! Double.isNaN(frequency) );
    assert(! Double.isInfinite(frequency));
    if (frequency == 0) { frequency = 1e-5; } // how many epsilons do we need!? This is worrisome.
    this.prior_alt_frequency = frequency;
}
// Base <-> index lookup tables shared by every walker instance.
// nuc2num maps an ASCII base (either case) to 0..3; num2nuc maps 0..3 back to
// the canonical uppercase base.
static int nuc2num[];
static char num2nuc[];
// BUG FIX: these tables were previously (re)built inside the constructor,
// which re-initialized shared static state on every instantiation and left
// the tables null for any static user (e.g. print_base_qual_matrix) invoked
// before the first instance existed. A static initializer runs exactly once
// at class-load time. Dead commented-out alternate encodings and env-var
// fallbacks were removed.
static {
    nuc2num = new int[128];
    nuc2num['A'] = 0;
    nuc2num['C'] = 1;
    nuc2num['G'] = 2;
    nuc2num['T'] = 3;
    nuc2num['a'] = 0;
    nuc2num['c'] = 1;
    nuc2num['g'] = 2;
    nuc2num['t'] = 3;
    num2nuc = new char[4];
    num2nuc[0] = 'A';
    num2nuc[1] = 'C';
    num2nuc[2] = 'G';
    num2nuc[3] = 'T';
}
public AlleleFrequencyWalker() {
    // Lookup tables are built in the static initializer; nothing per-instance.
}
/**
 * End-of-traversal hook: closes any confident-reference interval still open,
 * prints final metrics if enabled, then flushes and (for real files) closes
 * the GFF output stream.
 */
public void onTraversalDone(String result)
{
    if (inside_confident_ref_interval)
    {
        // if we have a confident reference interval still hanging open, close it.
        double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
        output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
            confident_ref_interval_contig,
            confident_ref_interval_start,
            last_position_considered,
            lod,
            (int)(confident_ref_interval_length));
        inside_confident_ref_interval = false;
    }
    if (LOG_METRICS) metrics.printMetrics();
    try
    {
        this.output.flush();
        // BUG FIX: close only when writing to a real file. The original also
        // called close() a second time unconditionally, which closed the
        // walker's standard output stream when GFF_OUTPUT_FILE was "-".
        if ( ! GFF_OUTPUT_FILE.equals("-") )
            this.output.close();
    }
    catch (Exception e)
    {
        e.printStackTrace();
        System.exit(-1);
    }
}
// Debugging aid: dump the per-read four-base probability matrix, one row per
// base (A/C/G/T order via num2nuc) and one column per read.
static void print_base_qual_matrix(double [][]quals) {
    System.out.println("Base quality matrix");
    final int readCount = quals.length;
    for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
        System.out.print(num2nuc[baseIndex]);
        for (int readIndex = 0; readIndex < readCount; readIndex++) {
            System.out.format(" %.4f", quals[readIndex][baseIndex]);
        }
        System.out.println();
    }
}
/**
 * Ad-hoc smoke run of binomialProb over a few representative inputs
 * (small-n direct branch and large-n Gaussian branch), then exits.
 *
 * NOTE(review): everything after the unconditional System.exit(0) in the
 * original — two hand-built heterozygous pileup scenarios exercising
 * AlleleFrequencyEstimator — was unreachable dead code and has been removed.
 * Resurrect from version control and delete the exit if those scenarios are
 * still wanted.
 */
public static void main(String[] args)
{
    AlleleFrequencyWalker.binomialProb(5, 10, 0.5);
    AlleleFrequencyWalker.binomialProb(50, 100, 0.5);
    AlleleFrequencyWalker.binomialProb(1500, 2965, 0.508065);
    AlleleFrequencyWalker.binomialProb(0, 197, 0.5);
    AlleleFrequencyWalker.binomialProb(13, 197, 0.5);
    AlleleFrequencyWalker.binomialProb(0, 7, 1e-3);
    System.exit(0);
}
}

View File

@ -6,9 +6,6 @@ import org.broadinstitute.sting.gatk.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils;

View File

@ -5,15 +5,13 @@ import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ListUtils;
import org.broadinstitute.sting.utils.genotype.GenotypeCall;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

View File

@ -1,20 +1,11 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.playground.utils.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
import java.util.zip.*;

View File

@ -1,187 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pileup;
import org.broadinstitute.sting.utils.BasicPileup;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import java.util.List;
import java.util.ArrayList;
public class FindNonrandomSecondBestBasePiles extends LocusWalker<Integer, Integer> {
@Argument(fullName="verbose",doc="verbose",required=false)
public boolean VERBOSE = false;
private AlleleFrequencyWalker caller_1b;
private AlleleFrequencyWalker caller_4b;
public void initialize()
{
caller_1b = new AlleleFrequencyWalker();
caller_1b.N = 2;
caller_1b.DOWNSAMPLE = 0;
caller_1b.GFF_OUTPUT_FILE = "/dev/null";
caller_1b.FORCE_1BASE_PROBS = true;
caller_1b.initalize();
caller_1b.reduceInit();
caller_4b = new AlleleFrequencyWalker();
caller_4b.N = 2;
caller_4b.DOWNSAMPLE = 0;
caller_4b.GFF_OUTPUT_FILE = "/dev/null";
caller_4b.FORCE_1BASE_PROBS = false;
caller_4b.initalize();
caller_4b.reduceInit();
}
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
return true; // We are keeping all the reads
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context)
{
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
// First, call the site, because we're mostly interested in hets
ref = Character.toUpperCase(ref);
AlleleFrequencyEstimate call_1b = caller_1b.map(tracker, ref, context);
AlleleFrequencyEstimate call_4b = caller_4b.map(tracker, ref, context);
// Only output data for sites that are confident disagreements.
if (Math.abs(call_1b.lodVsRef) < 5.0) { return 0; }
if (Math.abs(call_4b.lodVsRef) < 5.0) { return 0; }
if (call_1b.genotype().equals(call_4b.genotype())) { return 0; }
List<SAMRecord> reads = pileup.getReads();
List<Integer> offsets = pileup.getOffsets();
String best_bases = pileup.getBases();
String second_bases;
double[] quals = new double[reads.size()];
double[] second_quals = new double[reads.size()];
{
char[] second_bases_array = new char[reads.size()];
for ( int i = 0; i < reads.size(); i++ )
{
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte qual = (byte)read.getBaseQualities()[offset];
quals[i] = 1.0 - Math.pow(10,(double)qual/-10.0);
second_bases_array[i] = BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(((byte[])read.getAttribute("SQ"))[offset]));
second_quals[i] = QualityUtils.compressedQualityToProb(((byte[])read.getAttribute("SQ"))[offset]);
}
second_bases = new String(second_bases_array);
}
String rodString = "";
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
if ( datum != null && ! (datum instanceof rodDbSNP)) {
//System.out.printf("rod = %s%n", datum.toSimpleString());
rodString += datum.toSimpleString();
//System.out.printf("Rod string %s%n", rodString);
}
}
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
if ( dbsnp != null )
rodString += dbsnp.toMediumString();
if ( rodString != "" )
rodString = "[ROD: " + rodString + "]";
char[] bases = {'A', 'C', 'G', 'T'};
double[][] counts = new double[4][];
double[] totals = new double[4];
double[][] fractional_counts = new double[4][];
double[] fractional_totals = new double[4];
for (int i = 0; i < 4; i++)
{
counts[i] = new double[4];
fractional_counts[i] = new double[4];
for (int j = 0; j < best_bases.length(); j++)
{
if (best_bases.charAt(j) == bases[i])
{
counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += 1;
totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += 1;
fractional_counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += second_quals[j];
fractional_totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += second_quals[j];
}
}
}
out.printf("%s\t%c\t%s\n%s\t%c\t%s\n",
pileup.getLocation(),
ref,
best_bases,
pileup.getLocation(),
ref,
second_bases);
out.printf("%s\t%s\t%f\t%f\n",
pileup.getLocation(),
call_1b.genotype(),
call_1b.lodVsRef,
call_1b.lodVsNextBest);
out.printf("%s\t%s\t%f\t%f\n",
pileup.getLocation(),
call_4b.genotype(),
call_4b.lodVsRef,
call_4b.lodVsNextBest);
for (int i = 0; i < quals.length; i++)
{
out.printf("%s\t%c %c %f\t%f\t%f\t%f\t%f\n",
pileup.getLocation(),
best_bases.charAt(i),
second_bases.charAt(i),
quals[i],
second_quals[i],
-10.0 * Math.log10(1.0 - quals[i]),
-10.0 * Math.log10(1.0 - second_quals[i]),
Math.log10((quals[i])/(second_quals[i])));
}
out.printf("\n");
for (int i = 0; i < 4; i++)
{
out.printf("%s\t%c ", pileup.getLocation(), bases[i]);
for (int j = 0; j < 4; j++)
{
out.printf("%.03f ", counts[i][j] / totals[i]);
}
out.printf("\n");
}
/*
out.printf("\n");
for (int i = 0; i < 4; i++)
{
out.printf("%s\t%c ", pileup.getLocation(), bases[i]);
for (int j = 0; j < 4; j++)
{
out.printf("%.03f ", fractional_counts[i][j] / fractional_totals[i]);
}
out.printf("\n");
}
*/
out.printf("\n\n");
return 1;
}
// Given result of map function
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
}

View File

@ -10,10 +10,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
import java.io.*;
import java.util.*;
/**
@ -103,7 +104,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs);
GenotypeLikelihoods G = new GenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY);
SSGGenotypeCall geno = (SSGGenotypeCall)G.callGenotypes(tracker, ref, pileup);
SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup);
double mLikelihoods[] = geno.getLikelihoods();

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods;
import org.broadinstitute.sting.playground.utils.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.ReadBackedPileup;
@ -169,7 +170,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
G.applyPrior(ref, allele_likelihoods);
// Handle indels
if (CALL_INDELS)
/* if (CALL_INDELS)
{
String[] indels = BasicPileup.indelPileup(reads, offsets);
IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq);
@ -181,7 +182,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
{
G.addIndelLikelihood(null);
}
}
}*/
/*
// Handle 2nd-best base calls.
@ -229,7 +230,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
return allele_likelihoods;
}
double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
/* double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
{
HashMap<String, Double> indel_allele_likelihoods = new HashMap<String, Double>();
@ -256,7 +257,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
pAlt = pAlt / (pRef + pAlt);
return pAlt;
}
}*/
// Potential precision error here.
double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods)
@ -346,10 +347,10 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
allele_likelihoods = CountFreqs(G);
if (CALL_INDELS)
{
indel_alt_freq = CountIndelFreq(G);
}
// if (CALL_INDELS)
// {
// indel_alt_freq = CountIndelFreq(G);
// }
}
return new EM_Result(sample_names, G, allele_likelihoods);

View File

@ -1,22 +1,13 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.playground.utils.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.indels.Matrix;
import java.util.*;
import java.util.zip.*;
import java.io.*;
// Beta iterative multi-sample caller

View File

@ -1,301 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.*;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
 * Experimental walker that evaluates pooled allele-frequency estimation
 * against per-sample "truth" calls made from deep coverage.
 *
 * Per locus it: (1) splits the pileup per sample, (2) calls each sample from
 * full depth to establish truth, (3) downsamples the confidently-called
 * samples and pools them, (4) calls the pool, then (5) runs an EM loop that
 * re-calls each downsampled sample using the current pooled allele-frequency
 * estimate as a prior, comparing the shallow genotypes against truth.
 *
 * NOTE(review): per-sample callers are built with N=2 (diploid); the pooled
 * caller's N is reset each locus to the number of confidently-called
 * chromosomes.
 */
public class PoolCallingExperiment extends LocusWalker<AlleleFrequencyEstimate, String>
{
    List<AlleleFrequencyWalker> deep_callers = null;
    List<AlleleFrequencyWalker> shallow_callers = null;
    AlleleFrequencyWalker pooled_caller = null;
    List<String> sample_names = null;

    @Argument(required=false, shortName="downsample", doc="downsample") public int DOWNSAMPLE = 4;
    @Argument(required=false, shortName="downsample_noise", doc="downsample noise") public int DOWNSAMPLE_NOISE = 3;
    @Argument(required=false, shortName="log_metrics", doc="log metrics") public boolean LOG_METRICS = true;
    @Argument(required=false, shortName="fractional_counts", doc="fractional counts") public boolean FRACTIONAL_COUNTS = false;

    // Fixed seed so the per-sample downsampling is reproducible across runs.
    private Random random;

    /** Build one deep and one shallow caller per read-group sample, plus the pooled caller. */
    public void initialize()
    {
        GenomeAnalysisEngine toolkit = this.getToolkit();
        SAMFileHeader header = toolkit.getSAMFileHeader();
        List<SAMReadGroupRecord> read_groups = header.getReadGroups();

        sample_names = new ArrayList<String>();
        deep_callers = new ArrayList<AlleleFrequencyWalker>();
        shallow_callers = new ArrayList<AlleleFrequencyWalker>();

        random = new Random(42);

        for (int i = 0; i < read_groups.size(); i++)
        {
            String sample_name = read_groups.get(i).getSample();

            AlleleFrequencyWalker deep = new AlleleFrequencyWalker();
            AlleleFrequencyWalker shallow = new AlleleFrequencyWalker();

            deep.N = 2;
            deep.DOWNSAMPLE = 0;
            deep.GFF_OUTPUT_FILE = "/dev/null";
            deep.initalize();

            shallow.N = 2;
            shallow.DOWNSAMPLE = 0;
            shallow.GFF_OUTPUT_FILE = "/dev/null";
            shallow.initalize();

            sample_names.add(sample_name);
            deep_callers.add(deep);
            shallow_callers.add(shallow);
        }

        pooled_caller = new AlleleFrequencyWalker();
        pooled_caller.N = sample_names.size() * 2;
        pooled_caller.DOWNSAMPLE = 0;
        pooled_caller.GFF_OUTPUT_FILE = "/dev/null";
        pooled_caller.initalize();
        pooled_caller.reduceInit();
    }

    /**
     * Run the deep-truth / pooled / EM-shallow experiment at one locus and
     * print EVAL and TRAJECTORY lines; always returns null (results are
     * reported via stdout, not the reduce chain).
     */
    public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
    {
        // 1. separate each context.
        LocusContext[] deep_contexts = new LocusContext[sample_names.size()];
        for (int i = 0; i < sample_names.size(); i++)
        {
            deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
        }

        // 2. Pool the deep contexts. (for the hell of it)
        LocusContext deep_pooled_context = poolLocusContext(deep_contexts);

        // 3. Call individuals from deep coverage.
        AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()];
        for (int i = 0; i < sample_names.size(); i++)
        {
            deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]);
        }

        // 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool.
        // 4.1. Count "true" allele frequency from deep calls, and downsample the individuals with solid truth.
        double true_alt_freq = 0.0;
        double true_N = 0.0;
        LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()];
        for (int i = 0; i < deep_calls.length; i++)
        {
            downsampled_contexts[i] = null;
            // A sample contributes to truth only when its deep call is
            // confidently variant or confidently reference.
            if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
            {
                true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N;
                true_N += 2;
                downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE));
            }
        }
        true_alt_freq /= true_N;

        if (true_N == 0.0) { return null; } // just bail if true_N == 0.

        // 4.2. Pool just the contexts that have truth data.
        LocusContext pooled_context = poolLocusContext(downsampled_contexts);

        // EM Loop:
        AlleleFrequencyEstimate pooled_call = null;
        double correct_shallow_calls = 0;
        double total_shallow_calls = 0;
        AlleleFrequencyEstimate[] shallow_calls = null;
        double EM_alt_freq = 0;
        double EM_N = 0;
        double shallow_calls_fraction_correct = 0;

        // 5. Call the pool. (this step is the EM init)
        pooled_caller.N = (int)true_N;
        pooled_call = pooled_caller.map(tracker, ref, pooled_context);
        System.out.print("POOLED_CALL " + pooled_call.asTabularString());

        // (this loop is the EM cycle)
        EM_alt_freq = pooled_call.qstar; //pooled_call.qhat;
        int num_iterations = 10;
        double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq;
        double[] likelihood_trajectory = new double[num_iterations + 1]; likelihood_trajectory[0] = pooled_call.pBest;
        for (int iterations = 0; iterations < num_iterations; iterations++)
        {
            // 6. Re-call from shallow coverage using the estimated frequency as a prior,
            //    and compare to true deep calls,
            //    and compute new MAF estimate.
            correct_shallow_calls = 0;
            total_shallow_calls = 0;
            shallow_calls = new AlleleFrequencyEstimate[sample_names.size()];
            EM_N = 0.0;
            double EM_sum = 0.0;
            double likelihood = 0.0;
            for (int i = 0; i < deep_calls.length; i++)
            {
                // Only shallow-call things where we know the truth!
                if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
                {
                    shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
                    shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, downsampled_contexts[i]);

                    String deep_genotype = deep_calls[i].genotype();
                    String shallow_genotype = shallow_calls[i].genotype();

                    likelihood += shallow_calls[i].lodVsNextBest;

                    if (deep_genotype.equals(shallow_genotype))
                    {
                        correct_shallow_calls += 1;
                    }
                    total_shallow_calls += 1;

                    if (! FRACTIONAL_COUNTS)
                    {
                        EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N;
                        EM_N += shallow_calls[i].N;
                    }
                    else
                    {
                        // Fractional counting: weight each possible allele
                        // count by its posterior probability.
                        for (int j = 0; j <= shallow_calls[i].N; j++)
                        {
                            if (Double.isInfinite(shallow_calls[i].posteriors[j])) { shallow_calls[i].posteriors[j] = -10000; }
                            System.out.printf("DBG3: %d %f %d\n", j, shallow_calls[i].posteriors[j], shallow_calls[i].N);
                            EM_sum += Math.pow(10,shallow_calls[i].posteriors[j]) * (double)j;
                            EM_N += shallow_calls[i].N;
                        }
                    }
                }
            }
            EM_alt_freq = EM_sum / EM_N;
            shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls;
            trajectory[iterations+1] = EM_alt_freq;
            likelihood_trajectory[iterations+1] = likelihood/(double)total_shallow_calls;
            System.out.printf("DBGTRAJ %f %f %f %f\n", EM_sum, EM_N, trajectory[iterations], trajectory[iterations+1]);
        }

        // 7. Compare to estimation from the pool.
        System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n",
            pooled_call.location,
            pooled_call.lodVsRef,
            true_alt_freq,
            pooled_call.qhat,
            pooled_call.qstar,
            true_alt_freq * true_N,
            pooled_call.emperical_allele_frequency() * true_N,
            pooled_call.N,
            pooled_call.depth,
            total_shallow_calls,
            correct_shallow_calls,
            shallow_calls_fraction_correct,
            EM_alt_freq);

        for (int i = 0; i < likelihood_trajectory.length; i++)
        {
            System.out.printf("TRAJECTORY %f %f\n", trajectory[i], likelihood_trajectory[i]);
        }
        System.out.print("\n\n");

        return null;
    }

    /** Merge the reads/offsets of all non-null contexts into one LocusContext. */
    private LocusContext poolLocusContext(LocusContext[] contexts)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();
        GenomeLoc location = null;
        for (int i = 0; i < contexts.length; i++)
        {
            if (contexts[i] != null)
            {
                location = contexts[i].getLocation();
                reads.addAll(contexts[i].getReads());
                offsets.addAll(contexts[i].getOffsets());
            }
        }
        return new LocusContext(location, reads, offsets);
    }

    /**
     * Restrict a context to reads from one sample (matched via the read's RG
     * read group), optionally downsampling to a random subset of the reads.
     *
     * @param downsample target read count; 0 means keep all reads.
     */
    private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();
        for (int i = 0; i < context.getReads().size(); i++)
        {
            SAMRecord read = context.getReads().get(i);
            Integer offset = context.getOffsets().get(i);
            String RG = (String)(read.getAttribute("RG"));
            String sample = read.getHeader().getReadGroup(RG).getSample();
            // BUGFIX: was 'sample == sample_name' -- reference comparison on
            // String silently drops reads unless both strings are interned to
            // the same object; compare content instead.
            if (sample_name.equals(sample))
            {
                reads.add(read);
                offsets.add(offset);
            }
        }
        if (downsample != 0)
        {
            List<Integer> perm = new ArrayList<Integer>();
            for (int i = 0; i < reads.size(); i++) { perm.add(i); }
            perm = Utils.RandomSubset(perm, downsample);

            ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
            ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
            for (int i = 0; i < perm.size(); i++)
            {
                downsampled_reads.add(reads.get(perm.get(i)));
                downsampled_offsets.add(offsets.get(perm.get(i)));
            }
            reads = downsampled_reads;
            offsets = downsampled_offsets;
        }
        return new LocusContext(context.getLocation(), reads, offsets);
    }

    public void onTraversalDone()
    {
        return;
    }

    public String reduceInit()
    {
        return "";
    }

    /** All reporting happens in map(); the reduce chain carries nothing. */
    public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
    {
        return "";
    }
}

View File

@ -1,28 +1,12 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@By(DataSource.REFERENCE)
public class PrintCoverageWalker extends LocusWalker<Integer, Integer> {

View File

@ -7,8 +7,8 @@ import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.*;
@ -40,10 +40,10 @@ public class ClusteredSNPFilterWalker extends RefWalker<Integer, Integer> {
public void initialize() {
try {
vwriter = new PrintWriter(VARIANTS_OUT);
vwriter.println(AlleleFrequencyEstimate.geliHeaderString());
vwriter.println(GeliTextWriter.headerLine);
if ( FILTERED_OUT != null ) {
fwriter = new PrintWriter(FILTERED_OUT);
fwriter.println(AlleleFrequencyEstimate.geliHeaderString());
fwriter.println(GeliTextWriter.headerLine);
}
} catch (FileNotFoundException e) {
throw new StingException(String.format("Could not open file for writing"));

View File

@ -3,8 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileNotFoundException;
@ -55,7 +55,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
try {
vwriter = new PrintWriter(VARIANTS_OUT_HEAD + ".included.geli.calls");
vwriter.println(AlleleFrequencyEstimate.geliHeaderString());
vwriter.println(GeliTextWriter.headerLine);
swriters = new HashMap<String, PrintWriter>();
@ -118,7 +118,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
swriters.get(STUDY_NAME).print(vec.getStudyHeader() + "\t");
PrintWriter writer = new PrintWriter(VARIANTS_OUT_HEAD + ".excluded." + exclusionClassName + ".geli.calls");
writer.println(AlleleFrequencyEstimate.geliHeaderString());
writer.println(GeliTextWriter.headerLine);
ewriters.put(exclusionClassName, writer);
} catch (InstantiationException e) {

View File

@ -1,210 +0,0 @@
package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Arrays;
/**
 * Value object holding a single-locus allele-frequency call (ref/alt alleles,
 * MAF estimates qhat/qstar, LOD scores, per-allele-count posteriors) together
 * with a family of output formatters (GFF, geli, tabular, pooled tabular).
 *
 * NOTE(review): fields are public and mutable by convention in this package;
 * posteriors is assumed to have N+1 entries (one per possible alt-allele
 * count) -- confirm against the caller that fills it.
 */
public class AlleleFrequencyEstimate {
    public GenomeLoc location;
    public char ref;
    public char alt;
    public int N;             // number of chromosomes in the (possibly pooled) sample
    public double qhat;       // ML estimate of the alt allele frequency
    public double qstar;      // discretized (per-chromosome) alt allele frequency
    public double lodVsRef;
    public double lodVsNextBest;
    public double pBest;
    public double pRef;
    public int depth;
    public String notes;
    public String bases;
    //public double[][] quals;
    public double[] posteriors;
    public String sample_name;

    public int n_ref;
    public int n_het;
    public int n_hom;

    public GenotypeLikelihoods genotypeLikelihoods = null;

    GenomeLoc l;

    public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name)
    {
        this.location = location;
        this.ref = ref;
        this.alt = alt;
        this.N = N;
        this.qhat = qhat;
        this.qstar = qstar;
        this.lodVsRef = lodVsRef;
        this.lodVsNextBest = lodVsNextBest;
        this.pBest = pBest;
        this.pRef = pRef;
        this.depth = depth;
        this.notes = "";
        this.bases = bases;
        //this.quals = quals;
        this.posteriors = posteriors;
        this.sample_name = sample_name;
    }

    /** Confidently homozygous reference (LOD vs ref at or below -5). */
    public boolean isREF() { return (this.lodVsRef <= -5.0); }

    /** Confidently variant (LOD vs ref at or above 5). */
    public boolean isSNP() { return (this.lodVsRef >= 5.0); }

    /** Return the most likely genotype as a sorted string of N allele characters. */
    public String genotype()
    {
        int alt_count = (int)(qstar * N);
        int ref_count = N-alt_count;
        char[] alleles = new char[N];
        int i;
        for (i = 0; i < ref_count; i++) { alleles[i] = ref; }
        for (; i < N; i++) { alleles[i] = alt; }
        Arrays.sort(alleles);
        return new String(alleles);
    }

    /** qstar rounded to the nearest achievable frequency given this.N chromosomes. */
    public double emperical_allele_frequency()
    {
        return (double)Math.round((double)qstar * (double)N) / (double)N;
    }

    /** Same as above but with an explicit chromosome count (shadows this.N). */
    public double emperical_allele_frequency(int N)
    {
        return (double)Math.round((double)qstar * (double)N) / (double)N;
    }

    /** Render this call as a one-record GFF line (quals not yet included). */
    public String asGFFString()
    {
        String s = "";
        s += String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\t",
                           location.getContig(),
                           location.getStart(),
                           location.getStart(),
                           lodVsRef);
        s += String.format("\t;\tSAMPLE %s", sample_name);
        s += String.format("\t;\tREF %c", ref);
        s += String.format("\t;\tALT %c", alt);
        s += String.format("\t;\tFREQ %f", qstar);
        s += String.format("\t;\tGENOTYPE %s", this.genotype());
        s += String.format("\t;\tDEPTH %d", depth);
        s += String.format("\t;\tLODvsREF %f", lodVsRef);
        s += String.format("\t;\tLODvsNEXTBEST %f", lodVsNextBest);
        s += String.format("\t;\tQHAT %f", qhat);
        s += String.format("\t;\tQSTAR %f", qstar);
        s += String.format("\t;\tBASES %s", bases);
        s += ";\n";

        // add quals.

        return s;
    }

    public static String asTabularStringHeader()
    {
        return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases";
    }

    public static String geliHeaderString() {
        return "#Sequence    Position    ReferenceBase    NumberOfReads    MaxMappingQuality    BestGenotype    BtrLod    BtnbLod    AA    AC    AG    AT    CC    CG    CT    GG    GT    TT";
    }

    /** Render this call as one geli-format record (maxMapQ is not tracked; emitted as -1). */
    public String asGeliString()
    {
        // #Sequence       Position        ReferenceBase   NumberOfReads   MaxMappingQuality       BestGenotype    BtrLod  BtnbLod dbSNP   AA      AC      AG      AT      CC      CG      CT      GG      GT      TT
        // chr1    7764136 A       48      99      CC      83.650421       9.18159 -92.83638       -18.367548      -96.91729       -96.614204      -9.185958       -23.33643       -23.033337      -101.282059     -101.583092     -101.279999
        // chr  pos  ref  Nreads  maxMapQ  genotype  BtrLod  BtnbLod  dbSNP  AA AC AG AT CC CG CT GG GT TT
        //public double[] posteriors;
        return String.format("%s    %16d    %c    %8d    %d    %s    %.6f    %.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f    %6.6f",
                                location.getContig(),
                                location.getStart(),
                                ref,
                                depth,
                                -1,
                                genotype(),
                                lodVsRef,
                                lodVsNextBest,
                                posteriors[0],
                                posteriors[1],
                                posteriors[2],
                                posteriors[3],
                                posteriors[4],
                                posteriors[5],
                                posteriors[6],
                                posteriors[7],
                                posteriors[8],
                                posteriors[9]);
    }

    public String asTabularString()
    {
        return String.format("%s %s %c %c %s %f %f %f %f %d %s",
                             location,
                             sample_name,
                             ref,
                             alt,
                             genotype(),
                             qhat,
                             qstar,
                             lodVsRef,
                             lodVsNextBest,
                             depth,
                             bases);
    }

    public String toString() { return asTabularString(); }

    public String asString() {
        // Print out the called bases
        // Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
        // base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according
        // to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want
        // to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
        long numNonrefBases = Math.round(qstar * N);
        long numRefBases = N - numNonrefBases;
        if (ref < alt) { // order bases alphabetically
            return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
        }else{
            return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
        }
    }

    public String asPoolTabularString()
    {
        return String.format("%s %c %c %f %f %f %s %f %d %d %d %d",
                             location,
                             ref,
                             alt,
                             qstar,
                             pBest,
                             pRef,
                             "NA",
                             lodVsRef,
                             N,
                             n_ref,
                             n_het,
                             n_hom);
    }

    /** Posterior probability of the called alt-allele count. */
    public double posterior()
    {
        // BUGFIX: was 'posteriors[(int)this.qstar * this.N]'; the cast binds
        // to qstar alone (tighter than '*'), collapsing the index to 0 for any
        // qstar < 1.  Index by the discretized allele count, matching genotype().
        return this.posteriors[(int)(this.qstar * this.N)];
    }

    public String callType() {
        // Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP
        String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"};
        String genotype = genotype();
        int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0);
        return callTypeString[ref_matches];
    }
}

View File

@ -1,198 +0,0 @@
package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
/**
 * Accumulates genotype-calling accuracy metrics over a stream of loci and
 * prints a summary report: confident-call rate, variant rate, fraction of
 * variants present in dbSNP, and concordance with HapMap chip genotypes
 * (both exact-genotype agreement and reference-vs-variant agreement).
 *
 * Feed one call per locus via nextPosition(); emit the report with
 * printMetrics() or periodically via printMetricsAtLocusIntervals().
 */
public class AlleleMetrics {
// Minimum LOD for a call to count as "confident" (default 5; settable via ctor).
private double LOD_cutoff = 5;
// Confident variant calls that coincide with a dbSNP SNP.
private long dbsnp_hits = 0;
// Confident variant calls seen so far.
private long num_variants = 0;
// Total loci processed through nextPosition().
private long num_loci_total = 0;
// Loci whose best-vs-next-best LOD magnitude cleared the cutoff.
private long num_loci_confident = 0;
// HapMap chip concordance: exact-genotype agreement counters
// (NOTE(review): the code that increments these is commented out below).
private long hapmap_genotype_correct = 0;
private long hapmap_genotype_incorrect = 0;
// HapMap chip concordance: ref-vs-variant classification counters
// (het/hom "miscalls" intentionally do not count against these).
private long hapmap_refvar_correct = 0;
private long hapmap_refvar_incorrect = 0;
// Tolerance used when comparing doubles for (in)equality.
private final double dbl_cmp_precision = 0.0001;
// Destination stream for the metrics report, opened in initialize().
protected PrintStream out;
/** Writes metrics to the given path with the default LOD cutoff. */
public AlleleMetrics(String metricsOutputPath) {
initialize(new File(metricsOutputPath), LOD_cutoff);
}
/** Writes metrics to the given path with the supplied LOD cutoff. */
public AlleleMetrics(String metricsOutputPath, double lodThreshold) {
initialize(new File(metricsOutputPath), lodThreshold);
}
/** Writes metrics to the given file with the default LOD cutoff. */
public AlleleMetrics(File metricsOutputFile) {
initialize(metricsOutputFile, LOD_cutoff);
}
/** Writes metrics to the given file with the supplied LOD cutoff. */
public AlleleMetrics(File metricsOutputFile, double lodThreshold) {
initialize(metricsOutputFile, lodThreshold);
}
/**
 * Opens the output stream and records the LOD cutoff.
 * Exits the JVM if the file cannot be opened.
 * NOTE(review): parameter name carries a typo ("lodThresold"); also,
 * System.exit(-1) in a reusable class is harsh — consider throwing instead.
 */
private void initialize(File metricsOutputFile, double lodThresold) {
try {
this.out = new PrintStream(metricsOutputFile);
} catch (FileNotFoundException e) {
System.err.format("Unable to open file '%s'. Perhaps the parent directory does not exist or is read-only.", metricsOutputFile.getAbsolutePath());
System.exit(-1);
}
this.LOD_cutoff = lodThresold;
}
/**
 * Folds one locus into the running metrics.
 *
 * @param cl      the genotype call at this locus (must be an SSGGenotypeCall;
 *                the cast below will throw ClassCastException otherwise)
 * @param tracker reference-ordered data at this locus, scanned for a dbSNP
 *                SNP record and a HapMap chip genotype (rodGFF)
 */
public void nextPosition(GenotypeCall cl, RefMetaDataTracker tracker) {
SSGGenotypeCall call = (SSGGenotypeCall)cl;
num_loci_total += 1;
// Scan the ROD bindings for a dbSNP SNP and/or a HapMap chip genotype.
boolean is_dbSNP_SNP = false;
boolean has_hapmap_chip_genotype = false;
rodGFF hapmap_chip_genotype = null;
for ( ReferenceOrderedDatum datum : tracker.getAllRods() )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP)
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
has_hapmap_chip_genotype = true;
hapmap_chip_genotype = (rodGFF)datum;
}
}
}
// NOTE(review): getBestRef()/getBestNext() are presumably best-vs-reference
// and best-vs-next-best LODs respectively — confirm against SSGGenotypeCall.
double result = call.getBestRef();
if (Math.abs(call.getBestNext()) >= LOD_cutoff) { num_loci_confident += 1; }
if (call.isVariation() && result >= LOD_cutoff)
{
// Confident variant.
num_variants += 1;
if (is_dbSNP_SNP)
{
dbsnp_hits += 1;
}
}
if (has_hapmap_chip_genotype) {
// convert hapmap call to mixture of ref/nonref
String hapmap_genotype = hapmap_chip_genotype.getFeature();
long refs=0, alts=0;
double hapmap_q;
// Determine the alt allele as whichever of the call's two bases is not ref.
String str = call.getBases();
char alt = str.charAt(0);
if (str.charAt(0) == call.getReferencebase()) alt = str.charAt(1);
for (char c : hapmap_genotype.toCharArray()) {
if (c == call.getReferencebase()) { refs++; }
if (c == alt) { alts++; }
}
// hapmap_q = alt-allele fraction of the chip genotype; -1 flags
// "chip genotype shares no alleles with this call" (uncomparable).
if (refs+alts > 0) {
hapmap_q = ((double) alts) / ((double) (refs+alts));
}else{
hapmap_q = -1.0;
}
// Hapmap debug info
//out.format("HAPMAP DEBUG %.2f %.2f %.2f ", hapmap_q, alleleFreq.qstar, alleleFreq.lodVsRef);
//String called_genotype = alleleFreq.asString();
//out.format("%s %s %c %c", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt);
//System.out.printf("DBG %f %s\n", LOD_cutoff, alleleFreq.asTabularString());
if (call.getBestNext() >= LOD_cutoff) {
// TODO : should this all be commented out?
/*
System.out.printf("DBG %f %f %f %f\n",
hapmap_q,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.lodVsNextBest);
*/
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
//if (hapmap_q != -1 && hapmap_q == alleleFreq.qstar) {
/*if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && Math.abs(hapmap_q - alleleFreq.qstar) <= dbl_cmp_precision) {
hapmap_genotype_correct++;
}else{
hapmap_genotype_incorrect++;
//System.out.printf(" INCORRECT GENOTYPE Bases: %s", AlleleFrequencyWalker.getBases(context));
//out.printf(" INCORRECT GENOTYPE");
//AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
}*/
// NOTE(review): with the block above commented out, the
// hapmap_genotype_* counters are never updated — dead branch.
}
if (result >= LOD_cutoff || -1 * result >= LOD_cutoff) {
// Now calculate ref / var performance - did we correctly classify the site as
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
boolean hapmap_var = hapmap_q != 0.0;
boolean called_var = call.isVariation();
//if (hapmap_q != -1 && hapmap_var != called_var) {
if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && hapmap_var != called_var) {
hapmap_refvar_incorrect++;
//out.printf(" INCORRECT REFVAR CALL");
}else{
hapmap_refvar_correct++;
}
}
//out.print("\n");
}
}
/**
 * Prints the accumulated metrics report to the output stream.
 * No-op until at least one locus has been processed.
 * NOTE(review): if num_variants > 0 while num_loci_confident == 0 the
 * percentage below prints Infinity — the two counters use different
 * LOD measures, so this is theoretically reachable.
 */
public void printMetrics()
{
if (num_loci_total == 0) { return; }
out.printf("\n");
out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
out.printf("-------------------------------------------------\n");
out.printf("Total loci                             : %d\n", num_loci_total);
out.printf("Total called with confidence           : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0)
{
out.printf("Number of variants                     : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
out.printf("Fraction of variant sites in dbSNP     : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
out.printf("-------------------------------------------------\n");
out.printf("--  Hapmap Genotyping performance      --\n");
out.printf("Num. conf. calls at Hapmap chip sites  : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
out.printf("Conf. calls at chip sites correct      : %d\n", hapmap_genotype_correct);
out.printf("Conf. calls at chip sites incorrect    : %d\n", hapmap_genotype_incorrect);
out.printf("%% of confident calls that are correct  : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
out.printf("-------------------------------------------------\n");
out.printf("--  Hapmap Reference/Variant performance --\n");
out.printf("Num. conf. calls at Hapmap chip sites  : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
out.printf("Conf. calls at chip sites correct      : %d\n", hapmap_refvar_correct);
out.printf("Conf. calls at chip sites incorrect    : %d\n", hapmap_refvar_incorrect);
out.printf("%% of confident calls that are correct  : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
}
out.println();
}
/** Prints the report every loci_interval processed loci. */
public void printMetricsAtLocusIntervals(int loci_interval) {
if (num_loci_total % loci_interval == 0) printMetrics();
}
}

View File

@ -154,7 +154,7 @@ public class BasicGenotype implements Genotype {
* @return
*/
@Override
public org.broadinstitute.sting.utils.genotype.variant.Variant toVariant() {
public Variant toVariant() {
return null;
}

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
import org.broadinstitute.sting.utils.genotype.variant.Variant;
import org.broadinstitute.sting.utils.genotype.Variant;
/**
* @author aaron

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.utils.genotype.calls;
package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.Genotype;

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
/**
* @author aaron
* <p/>

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.utils.genotype.variant;
package org.broadinstitute.sting.utils.genotype;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
@ -21,7 +21,7 @@ public interface Variant {
*
* @return VariantFrequency with the stored frequency
*/
public VariantFrequency getFrequency();
public double getNonRefAlleleFrequency();
/**
* get the confidence score for this variance

View File

@ -7,8 +7,8 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
import java.io.File;

View File

@ -3,7 +3,7 @@ package org.broadinstitute.sting.utils.genotype.geli;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
import java.io.File;
import java.io.FileNotFoundException;
@ -32,9 +32,11 @@ public class GeliTextWriter implements GenotypeWriter {
} catch (FileNotFoundException e) {
throw new StingException("Unable to open file " + file.toURI());
}
mWriter.println("#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT");
mWriter.println(headerLine);
}
public static String headerLine = "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT";
/**
* Add a genotype, given a genotype locus
*

View File

@ -8,7 +8,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
import java.io.DataOutputStream;
import java.io.File;