Updated Bayesian phasing method to output per-site phasing statistics (and to not cap PQ at 40)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4064 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-08-19 19:55:47 +00:00
parent 04e5b28f6d
commit effeedf1a3
2 changed files with 101 additions and 78 deletions

View File

@ -36,14 +36,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.PreciseNonNegativeDouble;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriterImpl;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -67,13 +63,17 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:20]", required = false)
protected Integer maxPhaseSites = 20; // 2^20 == 10^6 biallelic haplotypes
@Argument(fullName = "phaseScoreThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:0.66]", required = false)
protected Double phaseScoreThresh = 0.66;
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:4.77]", required = false)
protected Double phaseQualityThresh = 4.77; // PQ = 4.77 <=> P(error) = 10^(-4.77/10) = 0.33, P(correct) = 0.66, so that we have odds ratio of >= 2
@Argument(fullName = "phasedVCFFile", shortName = "phasedVCF", doc = "The name of the phased VCF file output", required = true)
protected String phasedVCFFile = null;
@Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files", required = false)
protected String variantStatsFilePrefix = null;
private VCFWriter writer = null;
private PhasingQualityStatsWriter statsWriter = null;
private LinkedList<VariantAndReads> siteQueue = null;
private VariantAndReads prevVr = null; // the VC emitted after phasing, and the alignment bases at the position emitted
@ -96,6 +96,9 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
public void initialize() {
siteQueue = new LinkedList<VariantAndReads>();
prevVr = new VariantAndReads(null, null, true);
if (variantStatsFilePrefix != null)
statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix);
}
public boolean generateExtendedEvents() {
@ -279,14 +282,18 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double score = maxEntry.getScore().getValue();
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tscore:\t" + score);
double posteriorProb = maxEntry.getScore().getValue();
int phaseQuality = new Integer(QualityUtils.probToQual(posteriorProb, 0.0)); // 0.0 <=> do NOT cap the quality!
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
genotypesArePhased = (score >= phaseScoreThresh);
if (statsWriter != null)
statsWriter.addStat(samp, distance(prevVc, vc), phaseQuality);
genotypesArePhased = (phaseQuality >= phaseQualityThresh);
if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevVc.getGenotype(samp));
ensurePhasing(biall, prevBiall, maxEntry.getHaplotypeClass().getRepresentative());
gtAttribs.put("PQ", new Integer(QualityUtils.probToQual(score)));
gtAttribs.put("PQ", phaseQuality);
logger.debug("CHOSE PHASE:\n" + biall + "\n\n");
}
@ -347,6 +354,15 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
return (loc1.onSameContig(loc2) && loc1.distance(loc2) <= cacheWindow);
}
private static int distance(VariantContext vc1, VariantContext vc2) {
GenomeLoc loc1 = VariantContextUtils.getLocation(vc1);
GenomeLoc loc2 = VariantContextUtils.getLocation(vc2);
if (!loc1.onSameContig(loc2))
return Integer.MAX_VALUE;
return loc1.distance(loc2);
}
private void writeVCF(VariantContext vc) {
if (writer == null)
initializeVcfWriter(vc);
@ -381,12 +397,14 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
writeVarContList(finalList);
if (writer != null)
writer.close();
if (statsWriter != null)
statsWriter.close();
out.println("Number of reads observed: " + result.getNumReads());
out.println("Number of variant sites observed: " + result.getNumVarSites());
out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites()));
out.println("\n-- Phasing summary [minimal haplotype probability: " + phaseScoreThresh + "] --");
out.println("\n-- Phasing summary [minimal haplotype probability: " + phaseQualityThresh + "] --");
for (Map.Entry<String, PhaseCounts> sampPhaseCountEntry : result.getPhaseCounts()) {
PhaseCounts pc = sampPhaseCountEntry.getValue();
out.println("Sample: " + sampPhaseCountEntry.getKey() + "\tNumber of tested sites: " + pc.numTestedSites + "\tNumber of phased sites: " + pc.numPhased);
@ -1085,56 +1103,45 @@ class PhasingStatsAndOutput {
}
}
class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
private int[] cards;
private int[] valList;
private boolean hasNext;
class PhasingQualityStatsWriter {
private String variantStatsFilePrefix;
private HashMap<String, BufferedWriter> sampleToStatsWriter = new HashMap<String, BufferedWriter>();
public CardinalityCounter(int[] cards) {
this.cards = cards;
this.valList = new int[cards.length];
for (int i = 0; i < cards.length; i++) {
if (this.cards[i] <= 0)
throw new StingException("CANNOT have zero cardinalities!");
this.valList[i] = 0;
}
this.hasNext = true;
public PhasingQualityStatsWriter(String variantStatsFilePrefix) {
this.variantStatsFilePrefix = variantStatsFilePrefix;
}
public boolean hasNext() {
return hasNext;
}
public void addStat(String sample, int distanceFromPrevious, int phasingQuality) {
BufferedWriter sampWriter = sampleToStatsWriter.get(sample);
if (sampWriter == null) {
String fileName = variantStatsFilePrefix + "." + sample + ".distance_PQ.txt";
public int[] next() {
if (!hasNext())
throw new StingException("CANNOT iterate past end!");
// Copy the assignment to be returned:
int[] nextList = new int[valList.length];
for (int i = 0; i < valList.length; i++)
nextList[i] = valList[i];
// Find the assignment after this one:
hasNext = false;
int i = cards.length - 1;
for (; i >= 0; i--) {
if (valList[i] < (cards[i] - 1)) {
valList[i]++;
hasNext = true;
break;
FileOutputStream output;
try {
output = new FileOutputStream(fileName);
} catch (FileNotFoundException e) {
throw new RuntimeException("Unable to create phasing quality stats file at location: " + fileName);
}
valList[i] = 0;
sampWriter = new BufferedWriter(new OutputStreamWriter(output));
sampleToStatsWriter.put(sample, sampWriter);
}
try {
sampWriter.write(distanceFromPrevious + "\t" + phasingQuality + "\n");
sampWriter.flush();
} catch (IOException e) {
throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e);
}
return nextList;
}
public void remove() {
throw new StingException("Cannot remove from CardinalityCounter!");
public void close() {
for (Map.Entry<String, BufferedWriter> sampWriterEntry : sampleToStatsWriter.entrySet()) {
BufferedWriter sampWriter = sampWriterEntry.getValue();
try {
sampWriter.flush();
sampWriter.close();
} catch (IOException e) {
throw new RuntimeException("Unable to close per-sample phasing quality stats file");
}
}
}
public Iterator<int[]> iterator() {
return this;
}
}
}

View File

@ -32,7 +32,7 @@ public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDo
}
}
public PreciseNonNegativeDouble(org.broadinstitute.sting.utils.PreciseNonNegativeDouble pd) {
public PreciseNonNegativeDouble(PreciseNonNegativeDouble pd) {
this.logValue = pd.logValue;
}
@ -44,24 +44,28 @@ public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDo
return logValue;
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble setEqual(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public PreciseNonNegativeDouble setEqual(PreciseNonNegativeDouble other) {
logValue = other.logValue;
return this;
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble plus(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
return new org.broadinstitute.sting.utils.PreciseNonNegativeDouble(this).plusEqual(other);
public PreciseNonNegativeDouble plus(PreciseNonNegativeDouble other) {
return new PreciseNonNegativeDouble(this).plusEqual(other);
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble times(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
return new org.broadinstitute.sting.utils.PreciseNonNegativeDouble(this).timesEqual(other);
public PreciseNonNegativeDouble times(PreciseNonNegativeDouble other) {
return new PreciseNonNegativeDouble(this).timesEqual(other);
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble div(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
return new org.broadinstitute.sting.utils.PreciseNonNegativeDouble(this).divEqual(other);
public PreciseNonNegativeDouble div(PreciseNonNegativeDouble other) {
return new PreciseNonNegativeDouble(this).divEqual(other);
}
public int compareTo(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public PreciseNonNegativeDouble absDiff(PreciseNonNegativeDouble other) {
return new PreciseNonNegativeDouble(absSubLog(this.logValue, other.logValue), true);
}
public int compareTo(PreciseNonNegativeDouble other) {
// Since log is monotonic: e^a R e^b <=> a R b, where R is one of: >, <, ==
double logValDiff = this.logValue - other.logValue;
if (Math.abs(logValDiff) <= EQUALS_THRESH)
@ -70,23 +74,33 @@ public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDo
return new Double(Math.signum(logValDiff)).intValue();
}
public boolean equals(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public boolean equals(PreciseNonNegativeDouble other) {
return (this.compareTo(other) == 0);
}
public boolean gt(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public boolean gt(PreciseNonNegativeDouble other) {
return (this.compareTo(other) > 0);
}
public boolean lt(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public boolean lt(PreciseNonNegativeDouble other) {
return (this.compareTo(other) < 0);
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble plusEqual(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
public PreciseNonNegativeDouble plusEqual(PreciseNonNegativeDouble other) {
logValue = addInLogSpace(logValue, other.logValue);
return this;
}
public PreciseNonNegativeDouble timesEqual(PreciseNonNegativeDouble other) {
logValue += other.logValue;
return this;
}
public PreciseNonNegativeDouble divEqual(PreciseNonNegativeDouble other) {
logValue -= other.logValue;
return this;
}
// If x = log(a), y = log(b), returns log(a+b)
public static double addInLogSpace(double x, double y) {
if (x == INFINITY || y == INFINITY) return INFINITY; //log( e^INFINITY + e^y ) = INFINITY
@ -108,15 +122,17 @@ public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDo
return maxVal + Math.log(1.0 + Math.exp(negDiff));
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble timesEqual(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
logValue += other.logValue;
return this;
}
public org.broadinstitute.sting.utils.PreciseNonNegativeDouble divEqual(org.broadinstitute.sting.utils.PreciseNonNegativeDouble other) {
logValue -= other.logValue;
return this;
}
// If x = log(a), y = log(b), returns log |a-b|
double absSubLog(double x, double y) {
if (x == -INFINITY && y == -INFINITY) {
// log |e^-INFINITY - e^-INFINITY| = log |0-0| = log(0) = -INFINITY
return -INFINITY;
}
else if (x >= y) // x + log(1-e^(y-x)) = log(a) + log(1-e^(log(b)-log(a))) = log(a) + log(1-b/a) = a - b = |a-b|, since x >= y
return x + Math.log(1 - Math.exp(y-x));
else
return y + Math.log(1 - Math.exp(x-y));
}
public String toString() {
return new StringBuilder().append(getValue()).toString();