Fixed phasing algorithm to: 1. More correctly weed out irrelevant reads and sites; 2. Crudely flag sites with large phase discrepancies between reads

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4368 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-09-28 23:02:53 +00:00
parent 5a5c72c80d
commit 8d8980e8eb
6 changed files with 270 additions and 167 deletions

View File

@ -111,15 +111,15 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
samplePrevGenotypes.put(samp, null);
}
else { // Both comp and eval have a non-null Genotype at this site:
Biallele compBiallele = new Biallele(compSampGt);
Biallele evalBiallele = new Biallele(evalSampGt);
AllelePair compAllelePair = new AllelePair(compSampGt);
AllelePair evalAllelePair = new AllelePair(evalSampGt);
boolean breakPhasing = false;
if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom())
breakPhasing = true; // since they are not both het or both hom
else { // both are het, or both are hom:
boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compBiallele, evalBiallele) && bottomMatchesBottom(compBiallele, evalBiallele));
boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compBiallele, evalBiallele) && bottomMatchesTop(compBiallele, evalBiallele));
boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair));
boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair));
if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop)
breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample
}
@ -154,12 +154,12 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
interesting.addReason("ONLY_EVAL", samp, group, "");
}
else { // both comp and eval are phased:
Biallele prevCompBiallele = new Biallele(prevCompAndEval.getCompGenotpye());
Biallele prevEvalBiallele = new Biallele(prevCompAndEval.getEvalGenotype());
AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye());
AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype());
// Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample:
boolean topsMatch = (topMatchesTop(prevCompBiallele, prevEvalBiallele) && topMatchesTop(compBiallele, evalBiallele));
boolean topMatchesBottom = (topMatchesBottom(prevCompBiallele, prevEvalBiallele) && topMatchesBottom(compBiallele, evalBiallele));
boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair));
boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair));
if (topsMatch || topMatchesBottom) {
ps.phasesAgree++;
@ -172,7 +172,7 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
else {
ps.phasesDisagree++;
logger.debug("SWITCHED locus: " + curLocus);
interesting.addReason("SWITCH", samp, group, toString(prevCompBiallele, compBiallele) + " -> " + toString(prevEvalBiallele, evalBiallele));
interesting.addReason("SWITCH", samp, group, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair));
}
}
}
@ -212,23 +212,23 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
return new Double(pq.toString());
}
public boolean topMatchesTop(Biallele b1, Biallele b2) {
public boolean topMatchesTop(AllelePair b1, AllelePair b2) {
return b1.getTopAllele().equals(b2.getTopAllele());
}
public boolean topMatchesBottom(Biallele b1, Biallele b2) {
public boolean topMatchesBottom(AllelePair b1, AllelePair b2) {
return b1.getTopAllele().equals(b2.getBottomAllele());
}
public boolean bottomMatchesTop(Biallele b1, Biallele b2) {
public boolean bottomMatchesTop(AllelePair b1, AllelePair b2) {
return topMatchesBottom(b2, b1);
}
public boolean bottomMatchesBottom(Biallele b1, Biallele b2) {
public boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) {
return b1.getBottomAllele().equals(b2.getBottomAllele());
}
public String toString(Biallele prev, Biallele cur) {
public String toString(AllelePair prev, AllelePair cur) {
return prev.getTopAllele().getBaseString() + "," + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "," + cur.getBottomAllele().getBaseString();
}

View File

@ -30,13 +30,13 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.List;
public class Biallele {
public class AllelePair {
private Allele top;
private Allele bottom;
public Biallele(Genotype gt) {
public AllelePair(Genotype gt) {
if (gt.getPloidy() != 2)
throw new ReviewedStingException("Biallele must have ploidy of 2!");
throw new ReviewedStingException("AllelePair must have ploidy of 2!");
this.top = gt.getAllele(0);
this.bottom = gt.getAllele(1);

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers.phasing;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
/*
* Copyright (c) 2010, The Broad Institute
@ -55,11 +54,11 @@ public class DisjointSet {
}
public boolean inSameSet(int x, int y) {
return (findSet(x) == findSet(y));
return (x == y || nodes[x].parent == nodes[y].parent || findSet(x) == findSet(y));
}
public List<Integer> inSameSetAs(int x, int[] testSet) {
List<Integer> sameSetInds = new LinkedList<Integer>();
public Set<Integer> inSameSetAs(int x, Collection<Integer> testSet) {
Set<Integer> sameSetInds = new TreeSet<Integer>();
int xSet = findSet(x);
for (int t : testSet) {

View File

@ -26,6 +26,7 @@ import java.util.*;
* OTHER DEALINGS IN THE SOFTWARE.
*/
// Represents an undirected graph with no self-edges:
public class Graph implements Iterable<GraphEdge> {
private Neighbors[] adj;
@ -36,45 +37,58 @@ public class Graph implements Iterable<GraphEdge> {
}
public void addEdge(GraphEdge e) {
if (e.v1 == e.v2) // do not permit self-edges
return;
adj[e.v1].addNeighbor(e);
adj[e.v2].addNeighbor(e);
}
public void addEdges(Collection<GraphEdge> edges) {
for (GraphEdge e : edges)
addEdge(e);
}
public void removeEdge(GraphEdge e) {
adj[e.v1].removeNeighbor(e);
adj[e.v2].removeNeighbor(e);
}
public Collection<GraphEdge> removeAllIncidentEdges(int vertexIndex) {
Collection<GraphEdge> incidentEdges = new TreeSet<GraphEdge>(adj[vertexIndex].neighbors); // implemented GraphEdge.compareTo()
for (GraphEdge neighbEdge : incidentEdges) {
if (vertexIndex != neighbEdge.v1) // vertexIndex == neighbEdge.v2
adj[neighbEdge.v1].removeNeighbor(neighbEdge);
else if (vertexIndex != neighbEdge.v2) // vertexIndex == neighbEdge.v1
adj[neighbEdge.v2].removeNeighbor(neighbEdge);
}
adj[vertexIndex].clearAllNeighbors();
return incidentEdges;
}
public DisjointSet getConnectedComponents() {
DisjointSet cc = new DisjointSet(adj.length);
for (int i = 0; i < adj.length; i++)
for (GraphEdge e : adj[i])
cc.setUnion(e.v1, e.v2);
for (GraphEdge e : this)
cc.setUnion(e.v1, e.v2);
return cc;
}
// Note that this will give each edge TWICE [since e=(v1,v2) is stored as a neighbor for both v1 and v2]
public Iterator<GraphEdge> iterator() {
return new AllEdgesIterator();
}
public List<GraphEdge> getAllEdges() {
Set<GraphEdge> allEdges = new TreeSet<GraphEdge>(); // implemented GraphEdge.compareTo()
for (GraphEdge e : this)
allEdges.add(e);
return new LinkedList<GraphEdge>(allEdges);
}
public String toString() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < adj.length; i++) {
sb.append(i + ":");
for (GraphEdge e : adj[i])
sb.append(" " + e);
for (GraphEdge e : adj[i]) {
sb.append(" " + (e.v1 == i ? e.v2 : e.v1));
}
sb.append("\n");
}
@ -84,18 +98,29 @@ public class Graph implements Iterable<GraphEdge> {
private class AllEdgesIterator implements Iterator<GraphEdge> {
private int curInd;
private Iterator<GraphEdge> innerIt;
private GraphEdge nextEdge;
public AllEdgesIterator() {
curInd = 0;
innerIt = null;
nextEdge = null;
}
public boolean hasNext() {
if (nextEdge != null)
return true;
for (; curInd < adj.length; curInd++) {
if (innerIt == null)
innerIt = adj[curInd].iterator();
if (innerIt.hasNext())
return true;
while (innerIt.hasNext()) {
GraphEdge e = innerIt.next();
if (e.v1 == curInd) { // only want to see each edge once
nextEdge = e;
return true;
}
}
innerIt = null;
}
@ -107,7 +132,9 @@ public class Graph implements Iterable<GraphEdge> {
if (!hasNext())
throw new NoSuchElementException();
return innerIt.next();
GraphEdge tmpEdge = nextEdge;
nextEdge = null;
return tmpEdge;
}
public void remove() {
@ -133,5 +160,9 @@ public class Graph implements Iterable<GraphEdge> {
public Iterator<GraphEdge> iterator() {
return neighbors.iterator();
}
public void clearAllNeighbors() {
neighbors.clear();
}
}
}

View File

@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import javax.naming.OperationNotSupportedException;
import java.io.*;
import java.util.*;
@ -245,7 +244,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (isCalledDiploidGenotype(gt) && gt.isHet()) { // Can attempt to phase this genotype
PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
BialleleSNP biall = new BialleleSNP(gt);
SNPallelePair biall = new SNPallelePair(gt);
logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall);
DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
@ -257,9 +256,18 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
Genotype prevHetGenotype = prevUvc.getGenotype(samp);
PhaseResult pr = phaseSample(phaseWindow);
boolean genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh);
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
//
//
if (pr.phaseQuality < 0) {
logger.warn("MORE than 10% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(vc));
}
//
//
if (genotypesArePhased) {
BialleleSNP prevBiall = new BialleleSNP(prevHetGenotype);
SNPallelePair prevBiall = new SNPallelePair(prevHetGenotype);
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n");
logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n");
@ -311,6 +319,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
phaseStats.addIn(new PhasingStats(samplePhaseStats));
}
public boolean passesPhasingThreshold(double PQ) {
return PQ >= phaseQualityThresh;
}
private static class GenotypeAndReadBases {
public Genotype genotype;
public ReadBasesAtPosition readBases;
@ -449,7 +461,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("Number of sites in window = " + index);
if (logger.isDebugEnabled()) {
logger.debug("ALL READS:");
logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:");
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue();
@ -458,85 +470,98 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
}
private class ReadProperties {
public List<GraphEdge> rdEdges;
public int[] siteInds;
private class EdgeToReads {
private Map<GraphEdge, List<String>> edgeReads;
public ReadProperties(Read rd) {
this.siteInds = rd.getNonNullIndices();
this.rdEdges = new LinkedList<GraphEdge>();
public EdgeToReads() {
this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
}
// sufficient to create a path linking the sites in rd, so they all end up in the same connected component:
for (int i = 0; i < siteInds.length - 1; i++) {
GraphEdge e = new GraphEdge(siteInds[i], siteInds[i + 1]);
rdEdges.add(e);
public void addRead(GraphEdge e, String readName) {
List<String> reads = edgeReads.get(e);
if (reads == null) {
reads = new LinkedList<String>();
edgeReads.put(e, reads);
}
reads.add(readName);
}
public List<String> getReads(GraphEdge e) {
return edgeReads.get(e);
}
}
private class EdgeCounts {
private Map<GraphEdge, Integer> counts;
private class IntegerSet implements Iterable<Integer> {
private Set<Integer> list;
public EdgeCounts() {
this.counts = new TreeMap<GraphEdge, Integer>(); // implemented GraphEdge.compareTo()
public IntegerSet(Set<Integer> list) {
this.list = list;
}
public int getCount(GraphEdge e) {
Integer count = counts.get(e);
if (count == null)
return 0;
return count;
public boolean contains(int i) {
return list.contains(i);
}
public int incrementEdge(GraphEdge e) {
Integer eCount = counts.get(e);
int cnt;
if (eCount == null)
cnt = 0;
else
cnt = eCount;
cnt++;
counts.put(e, cnt);
return cnt;
public Iterator<Integer> iterator() {
return list.iterator();
}
public int decrementEdge(GraphEdge e) {
Integer eCount = counts.get(e);
if (eCount == null)
return 0;
int cnt = eCount - 1;
counts.put(e, cnt);
return cnt;
public String toString() {
StringBuilder sb = new StringBuilder();
for (int i : this) {
sb.append(i + ", ");
}
return sb.toString();
}
}
public Set<String> removeExtraneousReads(int numHetSites) {
Graph readGraph = new Graph(numHetSites);
Map<String, ReadProperties> readToGraphProperties = new HashMap<String, ReadProperties>();
EdgeCounts edgeCounts = new EdgeCounts();
EdgeToReads edgeToReads = new EdgeToReads();
Set<Integer> sitesWithEdges = new TreeSet<Integer>();
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue();
ReadProperties rp = new ReadProperties(rd);
if (!rp.rdEdges.isEmpty()) { // otherwise, this read is clearly irrelevant since it can't link anything
for (GraphEdge e : rp.rdEdges) {
readGraph.addEdge(e);
int[] siteInds = rd.getNonNullIndices();
// Connect each pair of non-null sites in rd:
for (int i = 0; i < siteInds.length; i++) {
for (int j = i + 1; j < siteInds.length; j++) {
GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]);
logger.debug("Read = " + rdName + " is adding edge: " + e);
readGraph.addEdge(e);
edgeCounts.incrementEdge(e);
edgeToReads.addRead(e, rdName);
sitesWithEdges.add(e.v1);
sitesWithEdges.add(e.v2);
}
readToGraphProperties.put(rdName, rp);
}
}
logger.debug("Read graph:\n" + readGraph);
Set<String> keepReads = new HashSet<String>();
// Check which Reads are involved in paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
In detail:
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by
considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and
cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant.
By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example,
if a path exists from prev ---> v1 that excludes v2.
Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well.
And, if the path uses edges from other reads, then keeping all other reads that contribute those edges
[which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist.
NOTE:
If we would use NON-UNIFORM priors for the various haplotypes, then this calculation would not be correct, since the equivalence of:
1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values).
2. The read has edges involved in a path from prev ---> cur.
DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior.
*/
int prev = phasingSiteIndex - 1;
int cur = phasingSiteIndex;
@ -546,56 +571,51 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
return keepReads;
}
for (Map.Entry<String, ReadProperties> rdEdgesEntry : readToGraphProperties.entrySet()) {
String testRead = rdEdgesEntry.getKey();
ReadProperties rp = rdEdgesEntry.getValue();
logger.debug("Testing the connectivity of Read: " + testRead);
/* Check the connected components of prev and cur when removing each individual vertex's edges:
[Total run-time: for each vertex, calculate connected components after removing its edges: O(V * E)]
*/
IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites];
IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites];
for (int i : sitesWithEdges) {
logger.debug("Calculating CC after removing edges of site: " + i);
// Check the connected components after removing this read's UNIQUE edges:
for (GraphEdge e : rp.rdEdges) {
if (edgeCounts.getCount(e) == 1) // otherwise, the edge still exists without this read
readGraph.removeEdge(e);
}
// Remove all edges incident to i and see which positions have paths to prev and cur:
Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i);
// Run-time for efficiently calculating connected components using DisjointSet: O(E)
DisjointSet ccAfterRemove = readGraph.getConnectedComponents();
removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges));
removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges));
/* testRead contributes a path between prev and cur iff:
There exists i != j s.t. testRead[i] != null, testRead[j] != null, ccAfterRemove.inSameSet(prev,i) && ccAfterRemove.inSameSet(j,cur)
[since ALL non-null indices in testRead are connected to one another, as one clique].
logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
// Add the removed edges back in:
readGraph.addEdges(removedEdges);
}
for (GraphEdge e : readGraph) {
logger.debug("Testing the path-connectivity of Edge: " + e);
/* Edge e={v1,v2} contributes a path between prev and cur for testRead iff:
testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa].
Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges,
and the path from v2 ---> cur will NOT contain v1.
*/
List<Integer> sameCCasPrev = ccAfterRemove.inSameSetAs(prev, rp.siteInds);
List<Integer> sameCCasCur = ccAfterRemove.inSameSetAs(cur, rp.siteInds);
if (logger.isDebugEnabled()) {
StringBuilder sb = new StringBuilder("sameCCasPrev:");
for (int ind : sameCCasPrev)
sb.append(" " + ind);
logger.debug(sb.toString());
boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.v1].contains(e.v2) && removedSiteSameCCAsCur[e.v2].contains(e.v1);
boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.v2].contains(e.v1) && removedSiteSameCCAsCur[e.v1].contains(e.v2);
sb = new StringBuilder("sameCCasCur:");
for (int ind : sameCCasCur)
sb.append(" " + ind);
logger.debug(sb.toString());
}
if (prevTo2and1ToCur || prevTo1and2ToCur) {
for (String readName : edgeToReads.getReads(e)) {
keepReads.add(readName);
boolean keepRead = false;
if (!sameCCasPrev.isEmpty() && !sameCCasCur.isEmpty()) { // There exists a path from prev to cur that goes through the sites in testRead
// Now, make sure that TWO DISTINCT sites, i and j, in testRead are used in the path:
Set<Integer> union = new HashSet<Integer>(sameCCasPrev);
union.addAll(sameCCasCur);
if (union.size() >= 2) // i != j
keepRead = true;
}
if (keepRead) {
logger.debug("Read is part of path from " + prev + " to " + cur);
keepReads.add(testRead);
// Add the removed edges back in, since we're keeping the read:
for (GraphEdge e : rp.rdEdges)
readGraph.addEdge(e);
}
else { // Decrease the count for the edges [note that any read-specific edges were already removed above]:
for (GraphEdge e : rp.rdEdges)
edgeCounts.decrementEdge(e);
if (logger.isDebugEnabled()) {
if (prevTo2and1ToCur)
logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v2 + " -> " + e.v1 + " ---> " + cur);
else
logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v1 + " -> " + e.v2 + " ---> " + cur);
}
}
}
}
@ -677,7 +697,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private PhaseResult phaseSample(PhasingWindow phaseWindow) {
/* Will map a phase and its "complement" to a single representative phase,
and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
*/
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2);
PhasingTable sampleHaps = tabCreator.getNewTable();
@ -693,6 +713,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
// Update the phasing table based on each of the sub-reads for this sample:
MaxHaplotypeAndQuality prevMaxHapAndQual = null;
int numHighQualityIterations = 0;
int numInconsistentIterations = 0;
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
Read rd = nameToReads.getValue();
@ -704,38 +727,88 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
}
//
//
//
// Check the current best haplotype assignment:
MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false);
logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
if (prevMaxHapAndQual != null && passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) {
numHighQualityIterations++;
if (!curMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative().equals(prevMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative()) || // switched phase
curMaxHapAndQual.phaseQuality < 0.9 * prevMaxHapAndQual.phaseQuality) { // a 10% ["significant"] decrease in PQ
logger.debug("Inconsistent read found!");
numInconsistentIterations++;
}
}
prevMaxHapAndQual = curMaxHapAndQual;
//
//
//
}
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
// Marginalize each haplotype to its first 2 positions:
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
// Determine the phase at this position:
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
double repPhaseQuality = maxHapQual.phaseQuality;
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double posteriorProb = maxEntry.getScore().getValue();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
//
//
if (numInconsistentIterations / (double) numHighQualityIterations >= 0.1) {
//
// ????
// NEED TO CHANGE phaseSite() to always output PQ field, EVEN if it's LESS THAN threshold ??????
// ????
//
repPhaseQuality *= -1;
}
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
//
//
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), repPhaseQuality);
}
return new PhaseResult(maxEntry.getHaplotypeClass().getRepresentative(), phaseQuality);
private static class MaxHaplotypeAndQuality {
public PhasingTable.PhasingTableEntry maxEntry;
public double phaseQuality;
public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) {
// Marginalize each haplotype to its first 2 positions:
hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable);
if (printDebug)
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n");
// Determine the phase at this position:
hapTable.normalizeScores();
if (printDebug)
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n");
this.maxEntry = hapTable.maxEntry();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
this.phaseQuality = getPhasingQuality(hapTable, maxEntry);
}
// Returns the PQ of entry (within table hapTable, which MUST be normalized):
public static double getPhasingQuality(PhasingTable hapTable, PhasingTable.PhasingTableEntry entry) {
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : hapTable) {
if (pte != entry)
sumErrorProbs.plusEqual(pte.getScore());
}
return -10.0 * (sumErrorProbs.getLog10Value());
}
}
/*
Ensure that curBiall is phased relative to prevBiall as specified by hap.
*/
public static void ensurePhasing(BialleleSNP curBiall, BialleleSNP prevBiall, Haplotype hap) {
public static void ensurePhasing(SNPallelePair curBiall, SNPallelePair prevBiall, Haplotype hap) {
if (hap.size() < 2)
throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!");
@ -784,7 +857,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
byte refBase;
if (!vc.isIndel()) {
Allele varAllele = vc.getReference();
refBase = BialleleSNP.getSingleBase(varAllele);
refBase = SNPallelePair.getSingleBase(varAllele);
}
else {
refBase = vc.getReferenceBaseForIndel();
@ -1023,14 +1096,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
byte[] hapBases = new byte[numSites];
for (int i = 0; i < numSites; i++) {
Allele alleleI = genotypes[i].getAllele(alleleInds[i]);
hapBases[i] = BialleleSNP.getSingleBase(alleleI);
hapBases[i] = SNPallelePair.getSingleBase(alleleI);
}
allHaps.add(new Haplotype(hapBases));
}
return allHaps;
}
public static PhasingTable marginalizeTable(PhasingTable table) {
public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
for (PhasingTable.PhasingTableEntry pte : table) {
Haplotype rep = pte.getHaplotypeClass().getRepresentative();
@ -1056,23 +1129,23 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator {
private BialleleSNP[] bialleleSNPs;
private SNPallelePair[] bialleleSNPs;
private int startIndex;
private int marginalizeLength;
public BiallelicComplementHaplotypeTableCreator(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
super(hetGenotypes);
this.bialleleSNPs = new BialleleSNP[genotypes.length];
this.bialleleSNPs = new SNPallelePair[genotypes.length];
for (int i = 0; i < genotypes.length; i++)
bialleleSNPs[i] = new BialleleSNP(genotypes[i]);
bialleleSNPs[i] = new SNPallelePair(genotypes[i]);
this.startIndex = startIndex;
this.marginalizeLength = marginalizeLength;
}
public PhasingTable getNewTable() {
double hapClassPrior = 1.0; // can change later
double hapClassPrior = 1.0; // can change later, BUT see NOTE above in removeExtraneousReads()
PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) {

View File

@ -28,15 +28,15 @@ import org.broad.tribble.util.variantcontext.Genotype;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
public class BialleleSNP extends Biallele {
public class SNPallelePair extends AllelePair {
public BialleleSNP(Genotype gt) {
public SNPallelePair(Genotype gt) {
super(gt);
if (getTopAllele().getBases().length != 1)
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!");
throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
if (getBottomAllele().getBases().length != 1)
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!");
throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
}
public byte getTopBase() {