Reorganization of the genotyping system
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1370 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9f1d3aed26
commit
3485397483
|
|
@ -1,15 +1,11 @@
|
|||
package org.broadinstitute.sting.playground.utils;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeGenerator;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
|
|
@ -18,7 +14,7 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
public class GenotypeLikelihoods implements GenotypeGenerator {
|
||||
public class GenotypeLikelihoods {
|
||||
// precalculate these for performance (pow/log10 is expensive!)
|
||||
|
||||
/**
|
||||
|
|
@ -336,16 +332,6 @@ public class GenotypeLikelihoods implements GenotypeGenerator {
|
|||
return this.ref_likelihood;
|
||||
}
|
||||
|
||||
private IndelLikelihood indel_likelihood;
|
||||
|
||||
public void addIndelLikelihood(IndelLikelihood indel_likelihood) {
|
||||
this.indel_likelihood = indel_likelihood;
|
||||
}
|
||||
|
||||
public IndelLikelihood getIndelLikelihood() {
|
||||
return this.indel_likelihood;
|
||||
}
|
||||
|
||||
/**
|
||||
* given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs
|
||||
*
|
||||
|
|
@ -355,8 +341,7 @@ public class GenotypeLikelihoods implements GenotypeGenerator {
|
|||
*
|
||||
* @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values
|
||||
*/
|
||||
@Override
|
||||
public GenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
|
||||
public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
|
||||
filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag
|
||||
|
||||
// for calculating the rms of the mapping qualities
|
||||
|
|
|
|||
|
|
@ -1,17 +1,13 @@
|
|||
package org.broadinstitute.sting.utils.genotype.calls;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
|
||||
import org.broadinstitute.sting.utils.genotype.LexigraphicalComparator;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
|
||||
import org.broadinstitute.sting.utils.genotype.variant.Variant;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,22 +1,16 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
|
||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
|
@ -108,13 +102,11 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute the AlleleFrequencyEstimate at a given locus.
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param ref the reference base
|
||||
* @param context contextual information around the locus
|
||||
*
|
||||
* @return an AlleleFrequencyEstimate object
|
||||
*/
|
||||
public SSGGenotypeCall map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
SSGGenotypeCall oldAndBusted = mapOldAndBusted(tracker, ref, context);
|
||||
|
|
|
|||
|
|
@ -1,915 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
//import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate, String>// implements AllelicVariant
|
||||
{
|
||||
@Argument(doc="Number of chromosomes in data") public int N;
|
||||
@Argument(required=false,doc="downsample") public int DOWNSAMPLE = 0;
|
||||
@Argument(doc="File to output GFF formatted allele frequency calls") public String GFF_OUTPUT_FILE;
|
||||
@Argument(shortName="met", doc="Turns on logging of metrics on the fly with AlleleFrequency calculation") public boolean LOG_METRICS;
|
||||
@Argument(required=false, doc="Name of file where metrics will output") public String METRICS_OUTPUT_FILE = "metrics.out";
|
||||
@Argument(required=false, doc="Ignores 4-base probabilities and only uses the quality of the best/called base") public boolean FORCE_1BASE_PROBS;
|
||||
|
||||
protected static Logger logger = Logger.getLogger(AlleleFrequencyWalker.class);
|
||||
|
||||
Random random;
|
||||
PrintStream output;
|
||||
|
||||
private boolean initalized = false;
|
||||
static private double quality_precision = 1e-4;
|
||||
public void initalize()
|
||||
{
|
||||
if (initalized) { return; }
|
||||
|
||||
try
|
||||
{
|
||||
this.random = new java.util.Random(0);
|
||||
if ( GFF_OUTPUT_FILE.equals("-") )
|
||||
this.output = out;
|
||||
else
|
||||
this.output = new PrintStream(GFF_OUTPUT_FILE);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
initalized = true;
|
||||
}
|
||||
|
||||
public boolean requiresReads() { return true; }
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
// init in the map because GATK doesn't appear to be initing me today. no problemo.
|
||||
this.initalize();
|
||||
|
||||
// Convert context data into bases and 4-base quals
|
||||
Pileup pileup = new ReadBackedPileup(ref, context);
|
||||
String bases = pileup.getBases();
|
||||
double quals[][] = getQuals(context);
|
||||
//String[] indels = getIndels(context);
|
||||
|
||||
logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE));
|
||||
|
||||
if ((DOWNSAMPLE != 0) && (DOWNSAMPLE < bases.length()))
|
||||
{
|
||||
String downsampled_bases = "";
|
||||
double downsampled_quals[][] = new double[DOWNSAMPLE][4];
|
||||
|
||||
int picked_bases[] = new int[bases.length()];
|
||||
for (int i = 0; i < picked_bases.length; i++) { picked_bases[i] = 0; }
|
||||
while (downsampled_bases.length() < DOWNSAMPLE)
|
||||
{
|
||||
int choice;
|
||||
|
||||
/*
|
||||
System.out.printf("DBG %b %b %b\n",
|
||||
random == null,
|
||||
bases == null,
|
||||
picked_bases == null);
|
||||
*/
|
||||
|
||||
for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length()));
|
||||
picked_bases[choice] = 1;
|
||||
downsampled_bases += bases.charAt(choice);
|
||||
downsampled_quals[downsampled_bases.length()-1] = quals[choice];
|
||||
}
|
||||
|
||||
//System.out.printf("From: %s\n", bases);
|
||||
//System.out.printf("To : %s\n", downsampled_bases);
|
||||
|
||||
bases = downsampled_bases;
|
||||
quals = downsampled_quals;
|
||||
}
|
||||
|
||||
// Count bases
|
||||
int[] base_counts = new int[4];
|
||||
for (byte b : bases.getBytes())
|
||||
base_counts[nuc2num[b]]++;
|
||||
|
||||
// Find alternate allele - 2nd most frequent non-ref allele
|
||||
// (maybe we should check for ties and eval both or check most common including quality scores)
|
||||
int altnum = -1; // start with first base numerical identity (0,1,2,3)
|
||||
double altcount = -1; // start with first base count
|
||||
int refnum = nuc2num[ref];
|
||||
|
||||
for (int b=0; b<4; b++) {
|
||||
if ((b != refnum) && (base_counts[b] > altcount)) {
|
||||
altnum = b; altcount = base_counts[b];
|
||||
}
|
||||
}
|
||||
assert(altnum != -1);
|
||||
|
||||
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
|
||||
|
||||
alleleFreq.notes = String.format("A:%d C:%d G:%d T:%d",
|
||||
base_counts[nuc2num['A']],
|
||||
base_counts[nuc2num['C']],
|
||||
base_counts[nuc2num['G']],
|
||||
base_counts[nuc2num['T']]);
|
||||
|
||||
// Print dbSNP data if its there
|
||||
if (true) {
|
||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
|
||||
if ( datum != null && datum instanceof rodDbSNP) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
//System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD()));
|
||||
alleleFreq.notes += String.format(" ROD: %s ",dbsnp.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug(String.format(" => result is %s", alleleFreq));
|
||||
|
||||
//if (LOG_METRICS) metrics.nextPosition(alleleFreq, tracker);
|
||||
|
||||
return alleleFreq;
|
||||
}
|
||||
|
||||
/*
|
||||
static public String[] getIndels(LocusContext context)
|
||||
{
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
String[] indels = new String[reads.size()];
|
||||
|
||||
for (int i = 0; i < reads.size(); i++)
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
Cigar cigar = read.getCigar();
|
||||
|
||||
int k = 0;
|
||||
for (int j = 0; j < cigar.numCigarElements(); j++)
|
||||
{
|
||||
CigarOperator operator = cigar.getCigarElement(j).getOperator();
|
||||
int length = cigar.getCigarElement(j).getLength();
|
||||
if (operator == CigarOperator.M)
|
||||
{
|
||||
k += length;
|
||||
}
|
||||
else if ((k != offset) && (operator == CigarOperator.I))
|
||||
{
|
||||
k += length;
|
||||
}
|
||||
else if ((k == offset) && (operator == CigarOperator.I))
|
||||
{
|
||||
// this insertion is associated with this offset.
|
||||
|
||||
break;
|
||||
}
|
||||
else if ((k == offset) && (operator == CigarOperator.D))
|
||||
{
|
||||
|
||||
break;
|
||||
}
|
||||
else if ((k == offset) &&
|
||||
((operator == CigarOperator.I) || (operator == CigarOperator.D)))
|
||||
{
|
||||
// no indel here.
|
||||
indels[i] = "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return indels;
|
||||
}
|
||||
*/
|
||||
|
||||
public double[][] getQuals (LocusContext context)
|
||||
{
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
double[][] quals = new double[numReads][4];
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < numReads; i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
// First, set the called base to the correct quality
|
||||
int callednum = nuc2num[read.getReadBases()[offset]]; // number of the called base (0,1,2,3)
|
||||
|
||||
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||
|
||||
// For other 3 bases, check if 2-base probs exist
|
||||
assert (reads.size() > 0);
|
||||
Object SQ_field = reads.get(i).getAttribute("SQ");
|
||||
if (SQ_field == null || FORCE_1BASE_PROBS) {
|
||||
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
|
||||
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
|
||||
for (int b=0; b<4; b++)
|
||||
{
|
||||
if (b != callednum)
|
||||
{
|
||||
quals[i][b] = nonref_quals;
|
||||
if (quals[i][b] <= quality_precision) { quals[i][b] = quality_precision; } // avoid zero probs.
|
||||
}
|
||||
}
|
||||
}else{
|
||||
assert (SQ_field instanceof byte[]);
|
||||
byte[] hex_quals = (byte[]) SQ_field;
|
||||
//System.out.printf("SQ field (hex): %s\n", bytesToHexString(hex_quals));
|
||||
//System.out.printf("SAM record: %s\n", read.format());
|
||||
|
||||
byte hex_qual = hex_quals[offset];
|
||||
int called2num = QualityUtils.compressedQualityToBaseIndex(hex_qual);
|
||||
|
||||
/*
|
||||
int crossTalkPartnerIndex = BaseUtils.crossTalkPartnerIndex(callednum);
|
||||
|
||||
if (called2num == crossTalkPartnerIndex) {
|
||||
double nonref_quals = (1.0 - quals[i][callednum]) / 3.0;
|
||||
for (int b=0; b<4; b++)
|
||||
if (b != callednum)
|
||||
quals[i][b] = nonref_quals;
|
||||
} else {
|
||||
*/
|
||||
double qual2 = QualityUtils.compressedQualityToProb(hex_qual);
|
||||
//System.out.printf("2ND %x %d %f\n", hex_qual, called2num, qual2);
|
||||
quals[i][called2num] = qual2;
|
||||
if (quals[i][called2num] <= quality_precision) { quals[i][called2num] = quality_precision; } // avoid zero probs.
|
||||
|
||||
double nonref_quals = (1.0 - quals[i][callednum] - quals[i][called2num]) / 2.0;
|
||||
for (int b=0; b<4; b++)
|
||||
if (b != callednum && b != called2num)
|
||||
quals[i][b] = nonref_quals;
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
//print_base_qual_matrix(quals);
|
||||
return quals;
|
||||
}
|
||||
|
||||
// Code pulled from SAMUtils for debugging
|
||||
static String bytesToHexString(final byte[] data) {
|
||||
final char[] chars = new char[2 * data.length];
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
final byte b = data[i];
|
||||
chars[2*i] = toHexDigit((b >> 4) & 0xF);
|
||||
chars[2*i+1] = toHexDigit(b & 0xF);
|
||||
}
|
||||
return new String(chars);
|
||||
}
|
||||
|
||||
private static char toHexDigit(final int value) {
|
||||
return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10));
|
||||
}
|
||||
|
||||
public AlleleFrequencyEstimate AlleleFrequencyEstimator(GenomeLoc location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
|
||||
{
|
||||
|
||||
// q = hypothetical %nonref
|
||||
// qstar = true %nonref
|
||||
// G = N, qstar, alt
|
||||
// N = number of chromosomes
|
||||
// alt = alternate hypothesis base (most frequent nonref base)
|
||||
// ref = reference base
|
||||
|
||||
double qstar;
|
||||
int qstar_N;
|
||||
|
||||
double qstep = 0.001;
|
||||
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
|
||||
|
||||
double posterior_null_hyp;
|
||||
|
||||
// Initialize empyt MixtureLikelihood holders for each possible qstar (N+1)
|
||||
MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1];
|
||||
double posteriors[] = new double[N+1];
|
||||
|
||||
{
|
||||
double q = ML_q_byEM(bases, quals, refnum, altnum);
|
||||
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
||||
long q_R = Math.round(q*bases.length);
|
||||
|
||||
posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0, 0) + P_G(N, 0);
|
||||
bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
|
||||
posteriors[0] = posterior_null_hyp;
|
||||
|
||||
//System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, 0.0, 0.0, P_D_q(bases, quals, 0.0, refnum, altnum), P_q_G(bases, N, 0.0, 0, 0), P_G(N, 0), prior_alt_frequency, posterior_null_hyp, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
|
||||
|
||||
assert(! Double.isNaN(P_D_q(bases, quals, 0.0, refnum, altnum)));
|
||||
assert(! Double.isNaN(P_q_G(bases, N, 0.0, 0, 0)));
|
||||
assert(! Double.isNaN(P_G(N, 0)));
|
||||
|
||||
assert(! Double.isNaN(posteriors[0]));
|
||||
assert(! Double.isInfinite(posteriors[0]));
|
||||
|
||||
// qstar - true allele balances
|
||||
//for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
|
||||
for (qstar_N = 1; qstar_N <= N; qstar_N += 1)
|
||||
{
|
||||
qstar = (double)qstar_N / (double)N;
|
||||
|
||||
double pqG = P_q_G(bases, N, q, qstar, q_R);
|
||||
double pG = P_G(N, qstar_N);
|
||||
double posterior = pDq + pqG + pG;
|
||||
|
||||
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
||||
posteriors[qstar_N] = posterior;
|
||||
|
||||
//System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n",
|
||||
// location, q, qstar, pDq, pqG, pG, prior_alt_frequency, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
|
||||
}
|
||||
}
|
||||
|
||||
// First reverse sort mixtures according to highest posterior probabililty
|
||||
Arrays.sort(bestMixtures);
|
||||
|
||||
// Calculate Lod of any variant call versus the reference call
|
||||
// Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis
|
||||
// reference mixture - qhat = qstar = 0.0
|
||||
double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp;
|
||||
|
||||
// Now reverse sort ALL mixtures according to highest posterior probability
|
||||
Arrays.sort(bestMixtures);
|
||||
|
||||
// Calculate Lod of the mixture versus other possible
|
||||
// Answers how confident are we in the best mixture versus the nextPosition best mixture
|
||||
double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
|
||||
|
||||
if (lodVarVsRef == 0) { lodVarVsRef = -1.0 * lodBestVsNextBest; }
|
||||
|
||||
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
|
||||
num2nuc[refnum],
|
||||
num2nuc[altnum],
|
||||
N,
|
||||
bestMixtures[0].qhat,
|
||||
bestMixtures[0].qstar,
|
||||
lodVarVsRef,
|
||||
lodBestVsNextBest,
|
||||
bestMixtures[0].posterior,
|
||||
posterior_null_hyp,
|
||||
depth,
|
||||
new String(bases),
|
||||
quals,
|
||||
posteriors,
|
||||
"unknown_sample");
|
||||
return alleleFreq;
|
||||
}
|
||||
|
||||
public class MixtureLikelihood implements Comparable<MixtureLikelihood> {
|
||||
public double posterior;
|
||||
public double qstar;
|
||||
public double qhat;
|
||||
|
||||
MixtureLikelihood() {
|
||||
this.posterior = Math.log10(0);
|
||||
this.qstar = Math.log10(0);
|
||||
this.qhat = Math.log10(0);
|
||||
}
|
||||
|
||||
MixtureLikelihood(double posterior, double qstar, double qhat) {
|
||||
this.posterior = posterior;
|
||||
this.qstar = qstar;
|
||||
this.qhat = qhat;
|
||||
}
|
||||
|
||||
public int compareTo (MixtureLikelihood other) {
|
||||
// default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element
|
||||
if (posterior < other.posterior) return 1;
|
||||
else if (posterior > other.posterior) return -1;
|
||||
else return 0;
|
||||
}
|
||||
}
|
||||
|
||||
double ML_q(byte[] bases, double[][]quals, int refnum, int altnum)
|
||||
{
|
||||
double ref_count = 0;
|
||||
double alt_count = 0;
|
||||
for (int i=0; i<bases.length; i++)
|
||||
{
|
||||
if (bases[i] == num2nuc[refnum]) { ref_count += 1; }
|
||||
if (bases[i] == num2nuc[altnum]) { alt_count += 1; }
|
||||
}
|
||||
return alt_count / (alt_count + ref_count);
|
||||
}
|
||||
|
||||
double ML_q_byEM(byte[] bases, double[][]quals, int refnum, int altnum)
|
||||
{
|
||||
double best_q = 0;
|
||||
double best_likelihood = -1000000;
|
||||
for (double q = 0.0; q <= 1.0; q += 0.001)
|
||||
{
|
||||
double likelihood = P_D_q(bases, quals, q, refnum, altnum);
|
||||
if (likelihood >= best_likelihood)
|
||||
{
|
||||
best_likelihood = likelihood;
|
||||
best_q = q;
|
||||
}
|
||||
}
|
||||
return best_q;
|
||||
}
|
||||
|
||||
double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum)
|
||||
{
|
||||
double p = 0.0;
|
||||
double epsilon = 1e-4;
|
||||
|
||||
for (int i=0; i<bases.length; i++)
|
||||
{
|
||||
double atomic = (1-q) * quals[i][refnum] + q * quals[i][altnum];
|
||||
if (atomic <= 0) { atomic = epsilon; }
|
||||
if (Double.isNaN(Math.log10(atomic))) { System.out.printf("DBG: %f %f %f\n", q, quals[i][refnum], quals[i][altnum]); }
|
||||
p += Math.log10(atomic);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
|
||||
{
|
||||
double epsilon = 1e-3;
|
||||
if (qstar == 0) { qstar = epsilon; }
|
||||
if (qstar == 1) { qstar = 1.0 - epsilon; }
|
||||
|
||||
if (N != 2) { return 0.0; }
|
||||
else { return binomialProb(q_R, bases.length, qstar); }
|
||||
}
|
||||
|
||||
double P_G(int N, int qstar_N)
|
||||
{
|
||||
// badly hard coded right now.
|
||||
|
||||
if ((N == 2) && (prior_alt_frequency != -1))
|
||||
{
|
||||
double p = -1.0;
|
||||
|
||||
if (qstar_N == 0) { p = Math.pow(1.0 - prior_alt_frequency, 2.0); }
|
||||
else if (qstar_N == 1) { p = 2.0 * (prior_alt_frequency * (1.0 - prior_alt_frequency)); }
|
||||
else if (qstar_N == 2) { p = Math.pow(prior_alt_frequency, 2.0); }
|
||||
else { assert(false); }
|
||||
|
||||
//System.out.printf("DBG2: %d %d %f %f\n", N, qstar_N, prior_alt_frequency, p);
|
||||
|
||||
return Math.log10(p);
|
||||
}
|
||||
|
||||
if (qstar_N == 0) { return Math.log10(0.999); }
|
||||
else if (qstar_N == N) { return Math.log10(1e-5); }
|
||||
else { return Math.log10(1e-3); }
|
||||
}
|
||||
|
||||
static String genotypeTypeString(double q, int N){
|
||||
switch (Math.round((float)q*N)) {
|
||||
case (0):
|
||||
return "ref";
|
||||
case (1):
|
||||
return "het";
|
||||
case (2):
|
||||
return "hom";
|
||||
}
|
||||
return "ERROR";
|
||||
}
|
||||
|
||||
static double binomialProb(long k, long n, double p) {
|
||||
// k - number of successes
|
||||
// n - number of Bernoulli trials
|
||||
// p - probability of success
|
||||
|
||||
double ans;
|
||||
|
||||
//if (((double)n*p < 5.0) && ((double)n*(1.0-p) < 5.0))
|
||||
if (n < 1000)
|
||||
{
|
||||
// For small n and the edges, compute it directly.
|
||||
ans = Math.log10((double)nchoosek(n, k)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
|
||||
//System.out.printf("DBG1: %d %d %f %f %f %f %f\n",
|
||||
// k, n, p,
|
||||
// nchoosek(n,k), Math.pow(p,k), Math.pow(1-p, n-k),
|
||||
// ans);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For large n, approximate with a gaussian.
|
||||
double mean = (double)(n*p);
|
||||
double var = Math.sqrt((double)(n*p)*(1.0-p));
|
||||
double ans_1 = Math.log10((double)(1.0 / (var*Math.sqrt(2*Math.PI)))*Math.exp(-1.0 * Math.pow((double)k-mean,2)/(2.0*var*var)));
|
||||
double ans_2 = ((Utils.logGamma(n+1) - Utils.logGamma(k+1) - Utils.logGamma(n-k+1))/Math.log(10)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
|
||||
double check = Math.log10((double)nchoosek(n, k)) + Math.log10(Math.pow(p, k)) + Math.log10(Math.pow(1-p, n-k));
|
||||
double residual = ans_2 - check;
|
||||
|
||||
//System.out.format("DBG: %d %.10f %.10f\n", nchoosek(n, k), Math.pow(p, k), Math.pow(1-p, n-k));
|
||||
//System.out.format("RESIDUAL: (%d,%d,%f) %.10f %.10f %.10f\n", k, n, p, ans, check, residual);
|
||||
|
||||
//System.out.printf("DBG2: %d %d %f %f %f %f %f\n", k, n, p, check, ans_1, ans_2, residual);
|
||||
|
||||
//if ((Double.isInfinite(check) || (Double.isNaN(check))))
|
||||
//{
|
||||
// System.out.printf("DBG2: %d %d %f %f\n", k, n, p, check);
|
||||
//}
|
||||
|
||||
ans = ans_1;
|
||||
}
|
||||
|
||||
//System.out.printf("DBG3: %d %d %f %f\n", n, k, p, ans);
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
static double nchoosek(long n, long k)
|
||||
{
|
||||
|
||||
if (k > n)
|
||||
return 0;
|
||||
|
||||
if (k > n/2)
|
||||
k = n-k; // Take advantage of symmetry
|
||||
|
||||
double accum = 1;
|
||||
for (long i = 1; i <= k; i++)
|
||||
accum = accum * (n-k+i) / i;
|
||||
|
||||
//return accum + 0.5; // avoid rounding error
|
||||
return accum; // avoid rounding error
|
||||
|
||||
/*
|
||||
long m = n - k;
|
||||
if (k < m)
|
||||
k = m;
|
||||
|
||||
long t = 1;
|
||||
for (long i = n, j = 1; i > k; i--, j++)
|
||||
t = t * i / j;
|
||||
return t;
|
||||
*/
|
||||
}
|
||||
|
||||
public static String repeat(char letter, long count) {
|
||||
// Repeat a character count times
|
||||
String result = "";
|
||||
for (int i=0; i<count; i++) {
|
||||
result = result + letter;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private String confident_ref_interval_contig = "";
|
||||
private long confident_ref_interval_start = 0;
|
||||
private double confident_ref_interval_LOD_sum = 0;
|
||||
private double confident_ref_interval_length = 0;
|
||||
private long last_position_considered = -1;
|
||||
private boolean inside_confident_ref_interval = false;
|
||||
AlleleMetrics metrics;
|
||||
|
||||
public String reduceInit()
|
||||
{
|
||||
confident_ref_interval_contig = "";
|
||||
confident_ref_interval_start = 0;
|
||||
confident_ref_interval_LOD_sum = 0;
|
||||
confident_ref_interval_length = 0;
|
||||
last_position_considered = -1;
|
||||
inside_confident_ref_interval = false;
|
||||
if (LOG_METRICS) metrics = new AlleleMetrics(METRICS_OUTPUT_FILE);
|
||||
return "";
|
||||
}
|
||||
|
||||
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
|
||||
{
|
||||
// Print RESULT data for confident calls
|
||||
|
||||
long current_offset = alleleFreq.location.getStart(); //Integer.parseInt(tokens[1]);
|
||||
|
||||
if (inside_confident_ref_interval &&
|
||||
((alleleFreq.lodVsRef > -5.0) || (current_offset != last_position_considered + 1)) )
|
||||
{
|
||||
// No longer hom-ref, so output a ref line.
|
||||
double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
|
||||
|
||||
output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
|
||||
confident_ref_interval_contig,
|
||||
confident_ref_interval_start,
|
||||
last_position_considered,
|
||||
lod,
|
||||
(int)(confident_ref_interval_length));
|
||||
|
||||
inside_confident_ref_interval = false;
|
||||
}
|
||||
else if (inside_confident_ref_interval && (alleleFreq.lodVsRef <= -5.0))
|
||||
{
|
||||
// Still hom-ref so increment the counters.
|
||||
confident_ref_interval_LOD_sum += alleleFreq.lodVsRef;
|
||||
confident_ref_interval_length += 1;
|
||||
}
|
||||
else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef > -5.0))
|
||||
{
|
||||
// do nothing.
|
||||
}
|
||||
else if ((!inside_confident_ref_interval) && (alleleFreq.lodVsRef <= -5.0))
|
||||
{
|
||||
// We moved into a hom-ref region so start a new interval.
|
||||
confident_ref_interval_contig = alleleFreq.location.getContig();
|
||||
confident_ref_interval_start = alleleFreq.location.getStart();
|
||||
confident_ref_interval_LOD_sum = alleleFreq.lodVsRef;
|
||||
confident_ref_interval_length = 1;
|
||||
inside_confident_ref_interval = true;
|
||||
}
|
||||
|
||||
last_position_considered = current_offset;
|
||||
|
||||
if (alleleFreq.lodVsRef >= 5) {
|
||||
this.output.print(alleleFreq.asGFFString());
|
||||
|
||||
/*
|
||||
String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N);
|
||||
System.out.print("DEBUG " + gtype + " ");
|
||||
if (gtype.contentEquals("het")) {
|
||||
System.out.println(alleleFreq.ref + "" + alleleFreq.alt);
|
||||
} else if (gtype.contentEquals("hom")) {
|
||||
System.out.println(alleleFreq.ref + "" + alleleFreq.ref);
|
||||
} else {
|
||||
System.out.println(alleleFreq.alt + "" + alleleFreq.alt);
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
if (LOG_METRICS) metrics.printMetricsAtLocusIntervals(1000);
|
||||
return "";
|
||||
}
|
||||
|
||||
private double prior_alt_frequency = -1.0;
|
||||
public void setAlleleFrequencyPrior(double frequency)
|
||||
{
|
||||
assert(! Double.isNaN(frequency) );
|
||||
assert(! Double.isInfinite(frequency));
|
||||
if (frequency == 0) { frequency = 1e-5; } // how many epsilons do we need!? This is worrisome.
|
||||
this.prior_alt_frequency = frequency;
|
||||
}
|
||||
|
||||
static int nuc2num[];
|
||||
static char num2nuc[];
|
||||
public AlleleFrequencyWalker() {
|
||||
nuc2num = new int[128];
|
||||
nuc2num['A'] = 0;
|
||||
nuc2num['C'] = 1;
|
||||
//nuc2num['T'] = 2;
|
||||
nuc2num['G'] = 2;
|
||||
//nuc2num['G'] = 3;
|
||||
nuc2num['T'] = 3;
|
||||
nuc2num['a'] = 0;
|
||||
nuc2num['c'] = 1;
|
||||
//nuc2num['t'] = 2;
|
||||
nuc2num['g'] = 2;
|
||||
//nuc2num['g'] = 3;
|
||||
nuc2num['t'] = 3;
|
||||
|
||||
num2nuc = new char[4];
|
||||
num2nuc[0] = 'A';
|
||||
num2nuc[1] = 'C';
|
||||
//num2nuc[2] = 'T';
|
||||
num2nuc[2] = 'G';
|
||||
//num2nuc[3] = 'G';
|
||||
num2nuc[3] = 'T';
|
||||
|
||||
//if (System.getenv("N") != null) { this.N = (new Integer(System.getenv("N"))).intValue(); }
|
||||
//else { this.N = 2; }
|
||||
//
|
||||
//if (System.getenv("DOWNSAMPLE") != null) { this.DOWNSAMPLE = (new Integer(System.getenv("DOWNSAMPLE"))).intValue(); }
|
||||
//else { this.DOWNSAMPLE = 0; }
|
||||
}
|
||||
|
||||
public void onTraversalDone(String result)
|
||||
{
|
||||
if (inside_confident_ref_interval)
|
||||
{
|
||||
// if we have a confident reference interval still hanging open, close it.
|
||||
|
||||
double lod = confident_ref_interval_LOD_sum / confident_ref_interval_length;
|
||||
|
||||
output.format("%s\tCALLER\tREFERENCE\t%d\t%d\t%f\t.\t.\tLENGTH %d\n",
|
||||
confident_ref_interval_contig,
|
||||
confident_ref_interval_start,
|
||||
last_position_considered,
|
||||
lod,
|
||||
(int)(confident_ref_interval_length));
|
||||
|
||||
inside_confident_ref_interval = false;
|
||||
}
|
||||
|
||||
if (LOG_METRICS) metrics.printMetrics();
|
||||
|
||||
try
|
||||
{
|
||||
this.output.flush();
|
||||
if ( ! GFF_OUTPUT_FILE.equals("-") )
|
||||
this.output.close();
|
||||
this.output.close();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void print_base_qual_matrix(double [][]quals) {
|
||||
// Print quals for debugging
|
||||
System.out.println("Base quality matrix");
|
||||
int numReads = quals.length;
|
||||
for (int b=0; b<4; b++) {
|
||||
System.out.print(num2nuc[b]);
|
||||
for (int i=0; i < numReads; i++){
|
||||
System.out.format(" %.4f", quals[i][b]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args)
|
||||
{
|
||||
|
||||
AlleleFrequencyWalker.binomialProb(5,10,0.5);
|
||||
AlleleFrequencyWalker.binomialProb(50,100,0.5);
|
||||
AlleleFrequencyWalker.binomialProb(1500,2965,0.508065);
|
||||
AlleleFrequencyWalker.binomialProb(0,197,0.5);
|
||||
AlleleFrequencyWalker.binomialProb(13,197,0.5);
|
||||
AlleleFrequencyWalker.binomialProb(0, 7, 1e-3);
|
||||
System.exit(0);
|
||||
|
||||
{
|
||||
int N = 2;
|
||||
byte[] het_bases = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
double[][] het_quals = {{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}};
|
||||
AlleleFrequencyWalker w = new AlleleFrequencyWalker();
|
||||
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20);
|
||||
System.out.print(String.format("50%% Het : %s %c %c %f %f %f %d %s\n",
|
||||
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null"));
|
||||
}
|
||||
|
||||
{
|
||||
byte[] het_bases = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
double[][] het_quals = {
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
{0.999, 0.001/3.0, 0.001/3.0, 0.001/3.0},
|
||||
};
|
||||
|
||||
assert(het_bases.length == het_quals.length);
|
||||
|
||||
int N = 10;
|
||||
AlleleFrequencyWalker w = new AlleleFrequencyWalker();
|
||||
w.N = 10;
|
||||
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(null, N, het_bases, het_quals, 0, 1, 20);
|
||||
System.out.print(String.format("10%% Het : %s %c %c %f %f %f %d %s\n",
|
||||
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.lodVsRef, 20, "null"));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -6,9 +6,6 @@ import org.broadinstitute.sting.gatk.*;
|
|||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
|
|
|
|||
|
|
@ -5,15 +5,13 @@ import org.broadinstitute.sting.gatk.LocusContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ListUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeCall;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,20 +1,11 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.playground.utils.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.zip.*;
|
||||
|
|
|
|||
|
|
@ -1,187 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pileup;
|
||||
import org.broadinstitute.sting.utils.BasicPileup;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class FindNonrandomSecondBestBasePiles extends LocusWalker<Integer, Integer> {
|
||||
@Argument(fullName="verbose",doc="verbose",required=false)
|
||||
public boolean VERBOSE = false;
|
||||
|
||||
private AlleleFrequencyWalker caller_1b;
|
||||
private AlleleFrequencyWalker caller_4b;
|
||||
public void initialize()
|
||||
{
|
||||
caller_1b = new AlleleFrequencyWalker();
|
||||
caller_1b.N = 2;
|
||||
caller_1b.DOWNSAMPLE = 0;
|
||||
caller_1b.GFF_OUTPUT_FILE = "/dev/null";
|
||||
caller_1b.FORCE_1BASE_PROBS = true;
|
||||
caller_1b.initalize();
|
||||
caller_1b.reduceInit();
|
||||
|
||||
caller_4b = new AlleleFrequencyWalker();
|
||||
caller_4b.N = 2;
|
||||
caller_4b.DOWNSAMPLE = 0;
|
||||
caller_4b.GFF_OUTPUT_FILE = "/dev/null";
|
||||
caller_4b.FORCE_1BASE_PROBS = false;
|
||||
caller_4b.initalize();
|
||||
caller_4b.reduceInit();
|
||||
}
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
|
||||
// First, call the site, because we're mostly interested in hets
|
||||
ref = Character.toUpperCase(ref);
|
||||
AlleleFrequencyEstimate call_1b = caller_1b.map(tracker, ref, context);
|
||||
AlleleFrequencyEstimate call_4b = caller_4b.map(tracker, ref, context);
|
||||
|
||||
// Only output data for sites that are confident disagreements.
|
||||
if (Math.abs(call_1b.lodVsRef) < 5.0) { return 0; }
|
||||
if (Math.abs(call_4b.lodVsRef) < 5.0) { return 0; }
|
||||
if (call_1b.genotype().equals(call_4b.genotype())) { return 0; }
|
||||
|
||||
List<SAMRecord> reads = pileup.getReads();
|
||||
List<Integer> offsets = pileup.getOffsets();
|
||||
String best_bases = pileup.getBases();
|
||||
String second_bases;
|
||||
double[] quals = new double[reads.size()];
|
||||
double[] second_quals = new double[reads.size()];
|
||||
{
|
||||
char[] second_bases_array = new char[reads.size()];
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
byte qual = (byte)read.getBaseQualities()[offset];
|
||||
quals[i] = 1.0 - Math.pow(10,(double)qual/-10.0);
|
||||
|
||||
second_bases_array[i] = BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(((byte[])read.getAttribute("SQ"))[offset]));
|
||||
second_quals[i] = QualityUtils.compressedQualityToProb(((byte[])read.getAttribute("SQ"))[offset]);
|
||||
}
|
||||
second_bases = new String(second_bases_array);
|
||||
}
|
||||
|
||||
String rodString = "";
|
||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
|
||||
if ( datum != null && ! (datum instanceof rodDbSNP)) {
|
||||
//System.out.printf("rod = %s%n", datum.toSimpleString());
|
||||
rodString += datum.toSimpleString();
|
||||
//System.out.printf("Rod string %s%n", rodString);
|
||||
}
|
||||
}
|
||||
|
||||
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
|
||||
if ( dbsnp != null )
|
||||
rodString += dbsnp.toMediumString();
|
||||
|
||||
if ( rodString != "" )
|
||||
rodString = "[ROD: " + rodString + "]";
|
||||
|
||||
char[] bases = {'A', 'C', 'G', 'T'};
|
||||
double[][] counts = new double[4][];
|
||||
double[] totals = new double[4];
|
||||
double[][] fractional_counts = new double[4][];
|
||||
double[] fractional_totals = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
counts[i] = new double[4];
|
||||
fractional_counts[i] = new double[4];
|
||||
for (int j = 0; j < best_bases.length(); j++)
|
||||
{
|
||||
if (best_bases.charAt(j) == bases[i])
|
||||
{
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += 1;
|
||||
totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += 1;
|
||||
|
||||
fractional_counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += second_quals[j];
|
||||
fractional_totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += second_quals[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.printf("%s\t%c\t%s\n%s\t%c\t%s\n",
|
||||
pileup.getLocation(),
|
||||
ref,
|
||||
best_bases,
|
||||
pileup.getLocation(),
|
||||
ref,
|
||||
second_bases);
|
||||
out.printf("%s\t%s\t%f\t%f\n",
|
||||
pileup.getLocation(),
|
||||
call_1b.genotype(),
|
||||
call_1b.lodVsRef,
|
||||
call_1b.lodVsNextBest);
|
||||
out.printf("%s\t%s\t%f\t%f\n",
|
||||
pileup.getLocation(),
|
||||
call_4b.genotype(),
|
||||
call_4b.lodVsRef,
|
||||
call_4b.lodVsNextBest);
|
||||
for (int i = 0; i < quals.length; i++)
|
||||
{
|
||||
out.printf("%s\t%c %c %f\t%f\t%f\t%f\t%f\n",
|
||||
pileup.getLocation(),
|
||||
best_bases.charAt(i),
|
||||
second_bases.charAt(i),
|
||||
quals[i],
|
||||
second_quals[i],
|
||||
-10.0 * Math.log10(1.0 - quals[i]),
|
||||
-10.0 * Math.log10(1.0 - second_quals[i]),
|
||||
Math.log10((quals[i])/(second_quals[i])));
|
||||
}
|
||||
out.printf("\n");
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
out.printf("%s\t%c ", pileup.getLocation(), bases[i]);
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
out.printf("%.03f ", counts[i][j] / totals[i]);
|
||||
}
|
||||
out.printf("\n");
|
||||
}
|
||||
/*
|
||||
out.printf("\n");
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
out.printf("%s\t%c ", pileup.getLocation(), bases[i]);
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
out.printf("%.03f ", fractional_counts[i][j] / fractional_totals[i]);
|
||||
}
|
||||
out.printf("\n");
|
||||
}
|
||||
*/
|
||||
out.printf("\n\n");
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -10,10 +10,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
/**
|
||||
|
|
@ -103,7 +104,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs);
|
||||
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY);
|
||||
SSGGenotypeCall geno = (SSGGenotypeCall)G.callGenotypes(tracker, ref, pileup);
|
||||
SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup);
|
||||
|
||||
|
||||
double mLikelihoods[] = geno.getLikelihoods();
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.playground.utils.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
|
|
@ -169,7 +170,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
|
|||
G.applyPrior(ref, allele_likelihoods);
|
||||
|
||||
// Handle indels
|
||||
if (CALL_INDELS)
|
||||
/* if (CALL_INDELS)
|
||||
{
|
||||
String[] indels = BasicPileup.indelPileup(reads, offsets);
|
||||
IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq);
|
||||
|
|
@ -181,7 +182,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
|
|||
{
|
||||
G.addIndelLikelihood(null);
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
/*
|
||||
// Handle 2nd-best base calls.
|
||||
|
|
@ -229,7 +230,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
|
|||
return allele_likelihoods;
|
||||
}
|
||||
|
||||
double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
/* double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
{
|
||||
HashMap<String, Double> indel_allele_likelihoods = new HashMap<String, Double>();
|
||||
|
||||
|
|
@ -256,7 +257,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
|
|||
pAlt = pAlt / (pRef + pAlt);
|
||||
|
||||
return pAlt;
|
||||
}
|
||||
}*/
|
||||
|
||||
// Potential precision error here.
|
||||
double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
|
|
@ -346,10 +347,10 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
|
|||
|
||||
allele_likelihoods = CountFreqs(G);
|
||||
|
||||
if (CALL_INDELS)
|
||||
{
|
||||
indel_alt_freq = CountIndelFreq(G);
|
||||
}
|
||||
// if (CALL_INDELS)
|
||||
// {
|
||||
// indel_alt_freq = CountIndelFreq(G);
|
||||
// }
|
||||
}
|
||||
|
||||
return new EM_Result(sample_names, G, allele_likelihoods);
|
||||
|
|
|
|||
|
|
@ -1,22 +1,13 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.playground.utils.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.playground.indels.Matrix;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.zip.*;
|
||||
import java.io.*;
|
||||
|
||||
// Beta iterative multi-sample caller
|
||||
|
|
|
|||
|
|
@ -1,301 +0,0 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class PoolCallingExperiment extends LocusWalker<AlleleFrequencyEstimate, String>
|
||||
{
|
||||
List<AlleleFrequencyWalker> deep_callers = null;
|
||||
List<AlleleFrequencyWalker> shallow_callers = null;
|
||||
AlleleFrequencyWalker pooled_caller = null;
|
||||
List<String> sample_names = null;
|
||||
|
||||
@Argument(required=false, shortName="downsample", doc="downsample") public int DOWNSAMPLE = 4;
|
||||
@Argument(required=false, shortName="downsample_noise", doc="downsample noise") public int DOWNSAMPLE_NOISE = 3;
|
||||
@Argument(required=false, shortName="log_metrics", doc="log metrics") public boolean LOG_METRICS = true;
|
||||
@Argument(required=false, shortName="fractional_counts", doc="fractional counts") public boolean FRACTIONAL_COUNTS = false;
|
||||
|
||||
private Random random;
|
||||
|
||||
public void initialize()
|
||||
{
|
||||
GenomeAnalysisEngine toolkit = this.getToolkit();
|
||||
SAMFileHeader header = toolkit.getSAMFileHeader();
|
||||
List<SAMReadGroupRecord> read_groups = header.getReadGroups();
|
||||
|
||||
sample_names = new ArrayList<String>();
|
||||
deep_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||
shallow_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||
|
||||
random = new Random(42);
|
||||
|
||||
for (int i = 0; i < read_groups.size(); i++)
|
||||
{
|
||||
String sample_name = read_groups.get(i).getSample();
|
||||
//System.out.println("SAMPLE: " + sample_name);
|
||||
|
||||
AlleleFrequencyWalker deep = new AlleleFrequencyWalker();
|
||||
AlleleFrequencyWalker shallow = new AlleleFrequencyWalker();
|
||||
|
||||
deep.N = 2;
|
||||
deep.DOWNSAMPLE = 0;
|
||||
deep.GFF_OUTPUT_FILE = "/dev/null";
|
||||
deep.initalize();
|
||||
|
||||
shallow.N = 2;
|
||||
shallow.DOWNSAMPLE = 0;
|
||||
shallow.GFF_OUTPUT_FILE = "/dev/null";
|
||||
shallow.initalize();
|
||||
|
||||
sample_names.add(sample_name);
|
||||
deep_callers.add(deep);
|
||||
shallow_callers.add(shallow);
|
||||
}
|
||||
|
||||
pooled_caller = new AlleleFrequencyWalker();
|
||||
pooled_caller.N = sample_names.size() * 2;
|
||||
pooled_caller.DOWNSAMPLE = 0;
|
||||
pooled_caller.GFF_OUTPUT_FILE = "/dev/null";
|
||||
pooled_caller.initalize();
|
||||
pooled_caller.reduceInit();
|
||||
}
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
// 1. seperate each context.
|
||||
LocusContext[] deep_contexts = new LocusContext[sample_names.size()];
|
||||
for (int i = 0; i < sample_names.size(); i++)
|
||||
{
|
||||
deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
|
||||
}
|
||||
|
||||
// 2. Pool the deep contexts. (for the hell of it)
|
||||
LocusContext deep_pooled_context = poolLocusContext(deep_contexts);
|
||||
|
||||
// 3. Call individuals from deep coverage.
|
||||
AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||
for (int i = 0; i < sample_names.size(); i++)
|
||||
{
|
||||
deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]);
|
||||
}
|
||||
|
||||
// 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool.
|
||||
// 4.1. Count "true" allele frequency from deep calls, and downsample the individuals with solid truth.
|
||||
double true_alt_freq = 0.0;
|
||||
double true_N = 0.0;
|
||||
LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()];
|
||||
for (int i = 0; i < deep_calls.length; i++)
|
||||
{
|
||||
downsampled_contexts[i] = null;
|
||||
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||
{
|
||||
true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N;
|
||||
true_N += 2;
|
||||
downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE));
|
||||
}
|
||||
}
|
||||
true_alt_freq /= true_N;
|
||||
|
||||
if (true_N == 0.0) { return null; } // just bail if true_N == 0.
|
||||
|
||||
// 4.2. Pool just the contexts that have truth data.
|
||||
LocusContext pooled_context = poolLocusContext(downsampled_contexts);
|
||||
|
||||
|
||||
// EM Loop:
|
||||
AlleleFrequencyEstimate pooled_call = null;
|
||||
|
||||
double correct_shallow_calls = 0;
|
||||
double total_shallow_calls = 0;
|
||||
AlleleFrequencyEstimate[] shallow_calls = null;
|
||||
double EM_alt_freq = 0;
|
||||
double EM_N = 0;
|
||||
double shallow_calls_fraction_correct = 0;
|
||||
|
||||
// 5. Call the pool. (this step is the EM init)
|
||||
pooled_caller.N = (int)true_N;
|
||||
pooled_call = pooled_caller.map(tracker, ref, pooled_context);
|
||||
System.out.print("POOLED_CALL " + pooled_call.asTabularString());
|
||||
|
||||
// (this loop is the EM cycle)
|
||||
EM_alt_freq = pooled_call.qstar; //pooled_call.qhat;
|
||||
int num_iterations = 10;
|
||||
double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq;
|
||||
double[] likelihood_trajectory = new double[num_iterations + 1]; likelihood_trajectory[0] = pooled_call.pBest;
|
||||
for (int iterations = 0; iterations < num_iterations; iterations++)
|
||||
{
|
||||
// 6. Re-call from shallow coverage using the estimated frequency as a prior,
|
||||
// and compare to true deep calls,
|
||||
// and compute new MAF estimate.
|
||||
correct_shallow_calls = 0;
|
||||
total_shallow_calls = 0;
|
||||
shallow_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||
EM_N = 0.0;
|
||||
double EM_sum = 0.0;
|
||||
double likelihood = 0.0;
|
||||
|
||||
for (int i = 0; i < deep_calls.length; i++)
|
||||
{
|
||||
// Only shallow-call things where we know the truth!
|
||||
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||
{
|
||||
shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
|
||||
shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, downsampled_contexts[i]);
|
||||
String deep_genotype = deep_calls[i].genotype();
|
||||
String shallow_genotype = shallow_calls[i].genotype();
|
||||
|
||||
likelihood += shallow_calls[i].lodVsNextBest;
|
||||
|
||||
//System.out.printf("DBG: %f %f %f %f\n",
|
||||
// deep_calls[i].lodVsNextBest,
|
||||
// deep_calls[i].lodVsRef,
|
||||
// shallow_calls[i].lodVsNextBest,
|
||||
// shallow_calls[i].lodVsRef);
|
||||
|
||||
if (deep_genotype.equals(shallow_genotype))
|
||||
{
|
||||
correct_shallow_calls += 1;
|
||||
}
|
||||
total_shallow_calls += 1;
|
||||
|
||||
if (! FRACTIONAL_COUNTS)
|
||||
{
|
||||
EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N;
|
||||
EM_N += shallow_calls[i].N;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j = 0; j <= shallow_calls[i].N; j++)
|
||||
{
|
||||
if (Double.isInfinite(shallow_calls[i].posteriors[j])) { shallow_calls[i].posteriors[j] = -10000; }
|
||||
System.out.printf("DBG3: %d %f %d\n", j, shallow_calls[i].posteriors[j], shallow_calls[i].N);
|
||||
EM_sum += Math.pow(10,shallow_calls[i].posteriors[j]) * (double)j;
|
||||
EM_N += shallow_calls[i].N;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
EM_alt_freq = EM_sum / EM_N;
|
||||
shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls;
|
||||
trajectory[iterations+1] = EM_alt_freq;
|
||||
likelihood_trajectory[iterations+1] = likelihood/(double)total_shallow_calls;
|
||||
|
||||
System.out.printf("DBGTRAJ %f %f %f %f\n", EM_sum, EM_N, trajectory[iterations], trajectory[iterations+1]);
|
||||
}
|
||||
|
||||
// 7. Compare to estimation from the pool.
|
||||
System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n",
|
||||
pooled_call.location,
|
||||
pooled_call.lodVsRef,
|
||||
true_alt_freq,
|
||||
pooled_call.qhat,
|
||||
pooled_call.qstar,
|
||||
true_alt_freq * true_N,
|
||||
pooled_call.emperical_allele_frequency() * true_N,
|
||||
pooled_call.N,
|
||||
pooled_call.depth,
|
||||
total_shallow_calls,
|
||||
correct_shallow_calls,
|
||||
shallow_calls_fraction_correct,
|
||||
EM_alt_freq);
|
||||
for (int i = 0; i < likelihood_trajectory.length; i++)
|
||||
{
|
||||
System.out.printf("TRAJECTORY %f %f\n", trajectory[i], likelihood_trajectory[i]);
|
||||
}
|
||||
System.out.print("\n\n");
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private LocusContext poolLocusContext(LocusContext[] contexts)
|
||||
{
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
|
||||
GenomeLoc location = null;
|
||||
|
||||
for (int i = 0; i < contexts.length; i++)
|
||||
{
|
||||
if (contexts[i] != null)
|
||||
{
|
||||
location = contexts[i].getLocation();
|
||||
reads.addAll(contexts[i].getReads());
|
||||
offsets.addAll(contexts[i].getOffsets());
|
||||
}
|
||||
}
|
||||
|
||||
return new LocusContext(location, reads, offsets);
|
||||
}
|
||||
|
||||
private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
|
||||
{
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
|
||||
for (int i = 0; i < context.getReads().size(); i++)
|
||||
{
|
||||
SAMRecord read = context.getReads().get(i);
|
||||
Integer offset = context.getOffsets().get(i);
|
||||
String RG = (String)(read.getAttribute("RG"));
|
||||
String sample = read.getHeader().getReadGroup(RG).getSample();
|
||||
if (sample == sample_name)
|
||||
{
|
||||
reads.add(read);
|
||||
offsets.add(offset);
|
||||
}
|
||||
}
|
||||
|
||||
if (downsample != 0)
|
||||
{
|
||||
List<Integer> perm = new ArrayList<Integer>();
|
||||
for (int i = 0; i < reads.size(); i++) { perm.add(i); }
|
||||
perm = Utils.RandomSubset(perm, downsample);
|
||||
|
||||
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
|
||||
|
||||
for (int i = 0; i < perm.size(); i++)
|
||||
{
|
||||
downsampled_reads.add(reads.get(perm.get(i)));
|
||||
downsampled_offsets.add(offsets.get(perm.get(i)));
|
||||
}
|
||||
|
||||
reads = downsampled_reads;
|
||||
offsets = downsampled_offsets;
|
||||
}
|
||||
|
||||
return new LocusContext(context.getLocation(), reads, offsets);
|
||||
}
|
||||
|
||||
public void onTraversalDone()
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
public String reduceInit()
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,28 +1,12 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@By(DataSource.REFERENCE)
|
||||
public class PrintCoverageWalker extends LocusWalker<Integer, Integer> {
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ import org.broadinstitute.sting.gatk.walkers.DataSource;
|
|||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.*;
|
||||
|
|
@ -40,10 +40,10 @@ public class ClusteredSNPFilterWalker extends RefWalker<Integer, Integer> {
|
|||
public void initialize() {
|
||||
try {
|
||||
vwriter = new PrintWriter(VARIANTS_OUT);
|
||||
vwriter.println(AlleleFrequencyEstimate.geliHeaderString());
|
||||
vwriter.println(GeliTextWriter.headerLine);
|
||||
if ( FILTERED_OUT != null ) {
|
||||
fwriter = new PrintWriter(FILTERED_OUT);
|
||||
fwriter.println(AlleleFrequencyEstimate.geliHeaderString());
|
||||
fwriter.println(GeliTextWriter.headerLine);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException(String.format("Could not open file for writing"));
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -55,7 +55,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
try {
|
||||
vwriter = new PrintWriter(VARIANTS_OUT_HEAD + ".included.geli.calls");
|
||||
vwriter.println(AlleleFrequencyEstimate.geliHeaderString());
|
||||
vwriter.println(GeliTextWriter.headerLine);
|
||||
|
||||
swriters = new HashMap<String, PrintWriter>();
|
||||
|
||||
|
|
@ -118,7 +118,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
|
|||
swriters.get(STUDY_NAME).print(vec.getStudyHeader() + "\t");
|
||||
|
||||
PrintWriter writer = new PrintWriter(VARIANTS_OUT_HEAD + ".excluded." + exclusionClassName + ".geli.calls");
|
||||
writer.println(AlleleFrequencyEstimate.geliHeaderString());
|
||||
writer.println(GeliTextWriter.headerLine);
|
||||
|
||||
ewriters.put(exclusionClassName, writer);
|
||||
} catch (InstantiationException e) {
|
||||
|
|
|
|||
|
|
@ -1,210 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.utils;
|
||||
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class AlleleFrequencyEstimate {
|
||||
|
||||
public GenomeLoc location;
|
||||
public char ref;
|
||||
public char alt;
|
||||
public int N;
|
||||
public double qhat;
|
||||
public double qstar;
|
||||
public double lodVsRef;
|
||||
public double lodVsNextBest;
|
||||
public double pBest;
|
||||
public double pRef;
|
||||
public int depth;
|
||||
public String notes;
|
||||
public String bases;
|
||||
//public double[][] quals;
|
||||
public double[] posteriors;
|
||||
public String sample_name;
|
||||
public int n_ref;
|
||||
public int n_het;
|
||||
public int n_hom;
|
||||
|
||||
public GenotypeLikelihoods genotypeLikelihoods = null;
|
||||
|
||||
GenomeLoc l;
|
||||
|
||||
|
||||
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name)
|
||||
{
|
||||
this.location = location;
|
||||
this.ref = ref;
|
||||
this.alt = alt;
|
||||
this.N = N;
|
||||
this.qhat = qhat;
|
||||
this.qstar = qstar;
|
||||
this.lodVsRef = lodVsRef;
|
||||
this.lodVsNextBest = lodVsNextBest;
|
||||
this.pBest = pBest;
|
||||
this.pRef = pRef;
|
||||
this.depth = depth;
|
||||
this.notes = "";
|
||||
this.bases = bases;
|
||||
//this.quals = quals;
|
||||
this.posteriors = posteriors;
|
||||
this.sample_name = sample_name;
|
||||
}
|
||||
|
||||
public boolean isREF() { return (this.lodVsRef <= -5.0); }
|
||||
public boolean isSNP() { return (this.lodVsRef >= 5.0); }
|
||||
|
||||
/** Return the most likely genotype. */
|
||||
public String genotype()
|
||||
{
|
||||
int alt_count = (int)(qstar * N);
|
||||
int ref_count = N-alt_count;
|
||||
char[] alleles = new char[N];
|
||||
int i;
|
||||
for (i = 0; i < ref_count; i++) { alleles[i] = ref; }
|
||||
for (; i < N; i++) { alleles[i] = alt; }
|
||||
Arrays.sort(alleles);
|
||||
return new String(alleles);
|
||||
}
|
||||
|
||||
public double emperical_allele_frequency()
|
||||
{
|
||||
return (double)Math.round((double)qstar * (double)N) / (double)N;
|
||||
}
|
||||
|
||||
public double emperical_allele_frequency(int N)
|
||||
{
|
||||
return (double)Math.round((double)qstar * (double)N) / (double)N;
|
||||
}
|
||||
|
||||
public String asGFFString()
|
||||
{
|
||||
String s = "";
|
||||
s += String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\t",
|
||||
location.getContig(),
|
||||
location.getStart(),
|
||||
location.getStart(),
|
||||
lodVsRef);
|
||||
s += String.format("\t;\tSAMPLE %s", sample_name);
|
||||
s += String.format("\t;\tREF %c", ref);
|
||||
s += String.format("\t;\tALT %c", alt);
|
||||
s += String.format("\t;\tFREQ %f", qstar);
|
||||
s += String.format("\t;\tGENOTYPE %s", this.genotype());
|
||||
s += String.format("\t;\tDEPTH %d", depth);
|
||||
s += String.format("\t;\tLODvsREF %f", lodVsRef);
|
||||
s += String.format("\t;\tLODvsNEXTBEST %f", lodVsNextBest);
|
||||
s += String.format("\t;\tQHAT %f", qhat);
|
||||
s += String.format("\t;\tQSTAR %f", qstar);
|
||||
s += String.format("\t;\tBASES %s", bases);
|
||||
|
||||
s += ";\n";
|
||||
|
||||
// add quals.
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
public static String asTabularStringHeader()
|
||||
{
|
||||
return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases";
|
||||
}
|
||||
|
||||
public static String geliHeaderString() {
|
||||
return "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT";
|
||||
}
|
||||
|
||||
public String asGeliString()
|
||||
{
|
||||
// #Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT
|
||||
// chr1 7764136 A 48 99 CC 83.650421 9.18159 -92.83638 -18.367548 -96.91729 -96.614204 -9.185958 -23.33643 -23.033337 -101.282059 -101.583092 -101.279999
|
||||
|
||||
// chr pos ref Nreads maxMapQ genotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT
|
||||
//public double[] posteriors;
|
||||
return String.format("%s %16d %c %8d %d %s %.6f %.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f %6.6f",
|
||||
location.getContig(),
|
||||
location.getStart(),
|
||||
ref,
|
||||
depth,
|
||||
-1,
|
||||
genotype(),
|
||||
lodVsRef,
|
||||
lodVsNextBest,
|
||||
posteriors[0],
|
||||
posteriors[1],
|
||||
posteriors[2],
|
||||
posteriors[3],
|
||||
posteriors[4],
|
||||
posteriors[5],
|
||||
posteriors[6],
|
||||
posteriors[7],
|
||||
posteriors[8],
|
||||
posteriors[9]);
|
||||
}
|
||||
|
||||
public String asTabularString()
|
||||
{
|
||||
return String.format("%s %s %c %c %s %f %f %f %f %d %s",
|
||||
location,
|
||||
sample_name,
|
||||
ref,
|
||||
alt,
|
||||
genotype(),
|
||||
qhat,
|
||||
qstar,
|
||||
lodVsRef,
|
||||
lodVsNextBest,
|
||||
depth,
|
||||
bases);
|
||||
}
|
||||
|
||||
public String toString() { return asTabularString(); }
|
||||
|
||||
public String asString() {
|
||||
// Print out the called bases
|
||||
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
||||
// base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according
|
||||
// to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want
|
||||
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
||||
long numNonrefBases = Math.round(qstar * N);
|
||||
long numRefBases = N - numNonrefBases;
|
||||
if (ref < alt) { // order bases alphabetically
|
||||
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
||||
}else{
|
||||
return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
|
||||
}
|
||||
}
|
||||
|
||||
public String asPoolTabularString()
|
||||
{
|
||||
return String.format("%s %c %c %f %f %f %s %f %d %d %d %d",
|
||||
location,
|
||||
ref,
|
||||
alt,
|
||||
qstar,
|
||||
pBest,
|
||||
pRef,
|
||||
"NA",
|
||||
lodVsRef,
|
||||
N,
|
||||
n_ref,
|
||||
n_het,
|
||||
n_hom);
|
||||
}
|
||||
|
||||
|
||||
public double posterior()
|
||||
{
|
||||
return this.posteriors[(int)this.qstar * this.N];
|
||||
}
|
||||
|
||||
public String callType() {
|
||||
// Returns a string indicating whether the call is homozygous reference, heterozygous SNP, or homozygous SNP
|
||||
|
||||
String[] callTypeString = {"HomozygousSNP", "HeterozygousSNP", "HomozygousReference"};
|
||||
String genotype = genotype();
|
||||
int ref_matches = (genotype.charAt(0) == ref ? 1 : 0) + (genotype.charAt(1) == ref ? 1 : 0);
|
||||
return callTypeString[ref_matches];
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,198 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.utils;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
|
||||
public class AlleleMetrics {
|
||||
private double LOD_cutoff = 5;
|
||||
|
||||
private long dbsnp_hits = 0;
|
||||
private long num_variants = 0;
|
||||
private long num_loci_total = 0;
|
||||
private long num_loci_confident = 0;
|
||||
private long hapmap_genotype_correct = 0;
|
||||
private long hapmap_genotype_incorrect = 0;
|
||||
private long hapmap_refvar_correct = 0;
|
||||
private long hapmap_refvar_incorrect = 0;
|
||||
|
||||
private final double dbl_cmp_precision = 0.0001;
|
||||
|
||||
protected PrintStream out;
|
||||
|
||||
public AlleleMetrics(String metricsOutputPath) {
|
||||
initialize(new File(metricsOutputPath), LOD_cutoff);
|
||||
}
|
||||
|
||||
public AlleleMetrics(String metricsOutputPath, double lodThreshold) {
|
||||
initialize(new File(metricsOutputPath), lodThreshold);
|
||||
}
|
||||
|
||||
public AlleleMetrics(File metricsOutputFile) {
|
||||
initialize(metricsOutputFile, LOD_cutoff);
|
||||
}
|
||||
|
||||
public AlleleMetrics(File metricsOutputFile, double lodThreshold) {
|
||||
initialize(metricsOutputFile, lodThreshold);
|
||||
}
|
||||
|
||||
private void initialize(File metricsOutputFile, double lodThresold) {
|
||||
try {
|
||||
this.out = new PrintStream(metricsOutputFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
System.err.format("Unable to open file '%s'. Perhaps the parent directory does not exist or is read-only.", metricsOutputFile.getAbsolutePath());
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
this.LOD_cutoff = lodThresold;
|
||||
}
|
||||
|
||||
public void nextPosition(GenotypeCall cl, RefMetaDataTracker tracker) {
|
||||
SSGGenotypeCall call = (SSGGenotypeCall)cl;
|
||||
num_loci_total += 1;
|
||||
|
||||
boolean is_dbSNP_SNP = false;
|
||||
boolean has_hapmap_chip_genotype = false;
|
||||
rodGFF hapmap_chip_genotype = null;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() )
|
||||
{
|
||||
if ( datum != null )
|
||||
{
|
||||
if ( datum instanceof rodDbSNP)
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
|
||||
if ( datum instanceof rodGFF )
|
||||
{
|
||||
has_hapmap_chip_genotype = true;
|
||||
hapmap_chip_genotype = (rodGFF)datum;
|
||||
}
|
||||
}
|
||||
}
|
||||
double result = call.getBestRef();
|
||||
if (Math.abs(call.getBestNext()) >= LOD_cutoff) { num_loci_confident += 1; }
|
||||
|
||||
if (call.isVariation() && result >= LOD_cutoff)
|
||||
{
|
||||
// Confident variant.
|
||||
|
||||
num_variants += 1;
|
||||
|
||||
if (is_dbSNP_SNP)
|
||||
{
|
||||
dbsnp_hits += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_hapmap_chip_genotype) {
|
||||
|
||||
// convert hapmap call to mixture of ref/nonref
|
||||
String hapmap_genotype = hapmap_chip_genotype.getFeature();
|
||||
long refs=0, alts=0;
|
||||
double hapmap_q;
|
||||
String str = call.getBases();
|
||||
char alt = str.charAt(0);
|
||||
if (str.charAt(0) == call.getReferencebase()) alt = str.charAt(1);
|
||||
for (char c : hapmap_genotype.toCharArray()) {
|
||||
if (c == call.getReferencebase()) { refs++; }
|
||||
if (c == alt) { alts++; }
|
||||
}
|
||||
|
||||
if (refs+alts > 0) {
|
||||
hapmap_q = ((double) alts) / ((double) (refs+alts));
|
||||
}else{
|
||||
hapmap_q = -1.0;
|
||||
}
|
||||
|
||||
// Hapmap debug info
|
||||
//out.format("HAPMAP DEBUG %.2f %.2f %.2f ", hapmap_q, alleleFreq.qstar, alleleFreq.lodVsRef);
|
||||
//String called_genotype = alleleFreq.asString();
|
||||
//out.format("%s %s %c %c", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt);
|
||||
|
||||
//System.out.printf("DBG %f %s\n", LOD_cutoff, alleleFreq.asTabularString());
|
||||
if (call.getBestNext() >= LOD_cutoff) {
|
||||
// TODO : should this all be commented out?
|
||||
/*
|
||||
System.out.printf("DBG %f %f %f %f\n",
|
||||
hapmap_q,
|
||||
alleleFreq.qhat,
|
||||
alleleFreq.qstar,
|
||||
alleleFreq.lodVsNextBest);
|
||||
*/
|
||||
|
||||
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
|
||||
//if (hapmap_q != -1 && hapmap_q == alleleFreq.qstar) {
|
||||
/*if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && Math.abs(hapmap_q - alleleFreq.qstar) <= dbl_cmp_precision) {
|
||||
hapmap_genotype_correct++;
|
||||
}else{
|
||||
hapmap_genotype_incorrect++;
|
||||
//System.out.printf(" INCORRECT GENOTYPE Bases: %s", AlleleFrequencyWalker.getBases(context));
|
||||
//out.printf(" INCORRECT GENOTYPE");
|
||||
//AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
|
||||
}*/
|
||||
}
|
||||
|
||||
if (result >= LOD_cutoff || -1 * result >= LOD_cutoff) {
|
||||
|
||||
// Now calculate ref / var performance - did we correctly classify the site as
|
||||
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
|
||||
boolean hapmap_var = hapmap_q != 0.0;
|
||||
boolean called_var = call.isVariation();
|
||||
//if (hapmap_q != -1 && hapmap_var != called_var) {
|
||||
if (Math.abs(hapmap_q - -1.0) > dbl_cmp_precision && hapmap_var != called_var) {
|
||||
hapmap_refvar_incorrect++;
|
||||
//out.printf(" INCORRECT REFVAR CALL");
|
||||
}else{
|
||||
hapmap_refvar_correct++;
|
||||
}
|
||||
}
|
||||
|
||||
//out.print("\n");
|
||||
}
|
||||
}
|
||||
|
||||
public void printMetrics()
|
||||
{
|
||||
if (num_loci_total == 0) { return; }
|
||||
|
||||
out.printf("\n");
|
||||
out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("Total loci : %d\n", num_loci_total);
|
||||
out.printf("Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
if (num_variants != 0)
|
||||
{
|
||||
out.printf("Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
out.printf("Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("-- Hapmap Genotyping performance --\n");
|
||||
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("-- Hapmap Reference/Variant performance --\n");
|
||||
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
|
||||
public void printMetricsAtLocusIntervals(int loci_interval) {
|
||||
if (num_loci_total % loci_interval == 0) printMetrics();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -154,7 +154,7 @@ public class BasicGenotype implements Genotype {
|
|||
* @return
|
||||
*/
|
||||
@Override
|
||||
public org.broadinstitute.sting.utils.genotype.variant.Variant toVariant() {
|
||||
public Variant toVariant() {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.genotype;
|
|||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
|
||||
import org.broadinstitute.sting.utils.genotype.variant.Variant;
|
||||
import org.broadinstitute.sting.utils.genotype.Variant;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.utils.genotype.calls;
|
||||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.calls.GenotypeCall;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.utils.genotype.variant;
|
||||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
|
||||
|
|
@ -21,7 +21,7 @@ public interface Variant {
|
|||
*
|
||||
* @return VariantFrequency with the stored frequency
|
||||
*/
|
||||
public VariantFrequency getFrequency();
|
||||
public double getNonRefAlleleFrequency();
|
||||
|
||||
/**
|
||||
* get the confidence score for this variance
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
|
|||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ package org.broadinstitute.sting.utils.genotype.geli;
|
|||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -32,9 +32,11 @@ public class GeliTextWriter implements GenotypeWriter {
|
|||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Unable to open file " + file.toURI());
|
||||
}
|
||||
mWriter.println("#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT");
|
||||
mWriter.println(headerLine);
|
||||
}
|
||||
|
||||
public static String headerLine = "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod AA AC AG AT CC CG CT GG GT TT";
|
||||
|
||||
/**
|
||||
* Add a genotype, given a genotype locus
|
||||
*
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeOutput;
|
|||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.calls.SSGGenotypeCall;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.File;
|
||||
|
|
|
|||
Loading…
Reference in New Issue