Fixed phasing algorithm to: 1. More correctly weed out irrelevant reads and sites; 2. Crudely flag sites with large phase discrepancies betweens reads

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4368 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-09-28 23:02:53 +00:00
parent 5a5c72c80d
commit 8d8980e8eb
6 changed files with 270 additions and 167 deletions

View File

@ -111,15 +111,15 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
samplePrevGenotypes.put(samp, null); samplePrevGenotypes.put(samp, null);
} }
else { // Both comp and eval have a non-null Genotype at this site: else { // Both comp and eval have a non-null Genotype at this site:
Biallele compBiallele = new Biallele(compSampGt); AllelePair compAllelePair = new AllelePair(compSampGt);
Biallele evalBiallele = new Biallele(evalSampGt); AllelePair evalAllelePair = new AllelePair(evalSampGt);
boolean breakPhasing = false; boolean breakPhasing = false;
if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom())
breakPhasing = true; // since they are not both het or both hom breakPhasing = true; // since they are not both het or both hom
else { // both are het, or both are hom: else { // both are het, or both are hom:
boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compBiallele, evalBiallele) && bottomMatchesBottom(compBiallele, evalBiallele)); boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair));
boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compBiallele, evalBiallele) && bottomMatchesTop(compBiallele, evalBiallele)); boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair));
if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop)
breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample
} }
@ -154,12 +154,12 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
interesting.addReason("ONLY_EVAL", samp, group, ""); interesting.addReason("ONLY_EVAL", samp, group, "");
} }
else { // both comp and eval are phased: else { // both comp and eval are phased:
Biallele prevCompBiallele = new Biallele(prevCompAndEval.getCompGenotpye()); AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye());
Biallele prevEvalBiallele = new Biallele(prevCompAndEval.getEvalGenotype()); AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype());
// Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample: // Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample:
boolean topsMatch = (topMatchesTop(prevCompBiallele, prevEvalBiallele) && topMatchesTop(compBiallele, evalBiallele)); boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair));
boolean topMatchesBottom = (topMatchesBottom(prevCompBiallele, prevEvalBiallele) && topMatchesBottom(compBiallele, evalBiallele)); boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair));
if (topsMatch || topMatchesBottom) { if (topsMatch || topMatchesBottom) {
ps.phasesAgree++; ps.phasesAgree++;
@ -172,7 +172,7 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
else { else {
ps.phasesDisagree++; ps.phasesDisagree++;
logger.debug("SWITCHED locus: " + curLocus); logger.debug("SWITCHED locus: " + curLocus);
interesting.addReason("SWITCH", samp, group, toString(prevCompBiallele, compBiallele) + " -> " + toString(prevEvalBiallele, evalBiallele)); interesting.addReason("SWITCH", samp, group, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair));
} }
} }
} }
@ -212,23 +212,23 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
return new Double(pq.toString()); return new Double(pq.toString());
} }
public boolean topMatchesTop(Biallele b1, Biallele b2) { public boolean topMatchesTop(AllelePair b1, AllelePair b2) {
return b1.getTopAllele().equals(b2.getTopAllele()); return b1.getTopAllele().equals(b2.getTopAllele());
} }
public boolean topMatchesBottom(Biallele b1, Biallele b2) { public boolean topMatchesBottom(AllelePair b1, AllelePair b2) {
return b1.getTopAllele().equals(b2.getBottomAllele()); return b1.getTopAllele().equals(b2.getBottomAllele());
} }
public boolean bottomMatchesTop(Biallele b1, Biallele b2) { public boolean bottomMatchesTop(AllelePair b1, AllelePair b2) {
return topMatchesBottom(b2, b1); return topMatchesBottom(b2, b1);
} }
public boolean bottomMatchesBottom(Biallele b1, Biallele b2) { public boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) {
return b1.getBottomAllele().equals(b2.getBottomAllele()); return b1.getBottomAllele().equals(b2.getBottomAllele());
} }
public String toString(Biallele prev, Biallele cur) { public String toString(AllelePair prev, AllelePair cur) {
return prev.getTopAllele().getBaseString() + "," + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "," + cur.getBottomAllele().getBaseString(); return prev.getTopAllele().getBaseString() + "," + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "," + cur.getBottomAllele().getBaseString();
} }

View File

@ -30,13 +30,13 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
public class Biallele { public class AllelePair {
private Allele top; private Allele top;
private Allele bottom; private Allele bottom;
public Biallele(Genotype gt) { public AllelePair(Genotype gt) {
if (gt.getPloidy() != 2) if (gt.getPloidy() != 2)
throw new ReviewedStingException("Biallele must have ploidy of 2!"); throw new ReviewedStingException("AllelePair must have ploidy of 2!");
this.top = gt.getAllele(0); this.top = gt.getAllele(0);
this.bottom = gt.getAllele(1); this.bottom = gt.getAllele(1);

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers.phasing; package org.broadinstitute.sting.playground.gatk.walkers.phasing;
import java.util.LinkedList; import java.util.*;
import java.util.List;
/* /*
* Copyright (c) 2010, The Broad Institute * Copyright (c) 2010, The Broad Institute
@ -55,11 +54,11 @@ public class DisjointSet {
} }
public boolean inSameSet(int x, int y) { public boolean inSameSet(int x, int y) {
return (findSet(x) == findSet(y)); return (x == y || nodes[x].parent == nodes[y].parent || findSet(x) == findSet(y));
} }
public List<Integer> inSameSetAs(int x, int[] testSet) { public Set<Integer> inSameSetAs(int x, Collection<Integer> testSet) {
List<Integer> sameSetInds = new LinkedList<Integer>(); Set<Integer> sameSetInds = new TreeSet<Integer>();
int xSet = findSet(x); int xSet = findSet(x);
for (int t : testSet) { for (int t : testSet) {

View File

@ -26,6 +26,7 @@ import java.util.*;
* OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
// Represents an undirected graph with no self-edges:
public class Graph implements Iterable<GraphEdge> { public class Graph implements Iterable<GraphEdge> {
private Neighbors[] adj; private Neighbors[] adj;
@ -36,45 +37,58 @@ public class Graph implements Iterable<GraphEdge> {
} }
public void addEdge(GraphEdge e) { public void addEdge(GraphEdge e) {
if (e.v1 == e.v2) // do not permit self-edges
return;
adj[e.v1].addNeighbor(e); adj[e.v1].addNeighbor(e);
adj[e.v2].addNeighbor(e); adj[e.v2].addNeighbor(e);
} }
public void addEdges(Collection<GraphEdge> edges) {
for (GraphEdge e : edges)
addEdge(e);
}
public void removeEdge(GraphEdge e) { public void removeEdge(GraphEdge e) {
adj[e.v1].removeNeighbor(e); adj[e.v1].removeNeighbor(e);
adj[e.v2].removeNeighbor(e); adj[e.v2].removeNeighbor(e);
} }
public Collection<GraphEdge> removeAllIncidentEdges(int vertexIndex) {
Collection<GraphEdge> incidentEdges = new TreeSet<GraphEdge>(adj[vertexIndex].neighbors); // implemented GraphEdge.compareTo()
for (GraphEdge neighbEdge : incidentEdges) {
if (vertexIndex != neighbEdge.v1) // vertexIndex == neighbEdge.v2
adj[neighbEdge.v1].removeNeighbor(neighbEdge);
else if (vertexIndex != neighbEdge.v2) // vertexIndex == neighbEdge.v1
adj[neighbEdge.v2].removeNeighbor(neighbEdge);
}
adj[vertexIndex].clearAllNeighbors();
return incidentEdges;
}
public DisjointSet getConnectedComponents() { public DisjointSet getConnectedComponents() {
DisjointSet cc = new DisjointSet(adj.length); DisjointSet cc = new DisjointSet(adj.length);
for (int i = 0; i < adj.length; i++) for (GraphEdge e : this)
for (GraphEdge e : adj[i]) cc.setUnion(e.v1, e.v2);
cc.setUnion(e.v1, e.v2);
return cc; return cc;
} }
// Note that this will give each edge TWICE [since e=(v1,v2) is stored as a neighbor for both v1 and v2]
public Iterator<GraphEdge> iterator() { public Iterator<GraphEdge> iterator() {
return new AllEdgesIterator(); return new AllEdgesIterator();
} }
public List<GraphEdge> getAllEdges() {
Set<GraphEdge> allEdges = new TreeSet<GraphEdge>(); // implemented GraphEdge.compareTo()
for (GraphEdge e : this)
allEdges.add(e);
return new LinkedList<GraphEdge>(allEdges);
}
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (int i = 0; i < adj.length; i++) { for (int i = 0; i < adj.length; i++) {
sb.append(i + ":"); sb.append(i + ":");
for (GraphEdge e : adj[i]) for (GraphEdge e : adj[i]) {
sb.append(" " + e); sb.append(" " + (e.v1 == i ? e.v2 : e.v1));
}
sb.append("\n"); sb.append("\n");
} }
@ -84,18 +98,29 @@ public class Graph implements Iterable<GraphEdge> {
private class AllEdgesIterator implements Iterator<GraphEdge> { private class AllEdgesIterator implements Iterator<GraphEdge> {
private int curInd; private int curInd;
private Iterator<GraphEdge> innerIt; private Iterator<GraphEdge> innerIt;
private GraphEdge nextEdge;
public AllEdgesIterator() { public AllEdgesIterator() {
curInd = 0; curInd = 0;
innerIt = null; innerIt = null;
nextEdge = null;
} }
public boolean hasNext() { public boolean hasNext() {
if (nextEdge != null)
return true;
for (; curInd < adj.length; curInd++) { for (; curInd < adj.length; curInd++) {
if (innerIt == null) if (innerIt == null)
innerIt = adj[curInd].iterator(); innerIt = adj[curInd].iterator();
if (innerIt.hasNext())
return true; while (innerIt.hasNext()) {
GraphEdge e = innerIt.next();
if (e.v1 == curInd) { // only want to see each edge once
nextEdge = e;
return true;
}
}
innerIt = null; innerIt = null;
} }
@ -107,7 +132,9 @@ public class Graph implements Iterable<GraphEdge> {
if (!hasNext()) if (!hasNext())
throw new NoSuchElementException(); throw new NoSuchElementException();
return innerIt.next(); GraphEdge tmpEdge = nextEdge;
nextEdge = null;
return tmpEdge;
} }
public void remove() { public void remove() {
@ -133,5 +160,9 @@ public class Graph implements Iterable<GraphEdge> {
public Iterator<GraphEdge> iterator() { public Iterator<GraphEdge> iterator() {
return neighbors.iterator(); return neighbors.iterator();
} }
public void clearAllNeighbors() {
neighbors.clear();
}
} }
} }

View File

@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import javax.naming.OperationNotSupportedException;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
@ -245,7 +244,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (isCalledDiploidGenotype(gt) && gt.isHet()) { // Can attempt to phase this genotype if (isCalledDiploidGenotype(gt) && gt.isHet()) { // Can attempt to phase this genotype
PhasingWindow phaseWindow = new PhasingWindow(vr, samp); PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
BialleleSNP biall = new BialleleSNP(gt); SNPallelePair biall = new SNPallelePair(gt);
logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall); logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall);
DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
@ -257,9 +256,18 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
Genotype prevHetGenotype = prevUvc.getGenotype(samp); Genotype prevHetGenotype = prevUvc.getGenotype(samp);
PhaseResult pr = phaseSample(phaseWindow); PhaseResult pr = phaseSample(phaseWindow);
boolean genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh); boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
//
//
if (pr.phaseQuality < 0) {
logger.warn("MORE than 10% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(vc));
}
//
//
if (genotypesArePhased) { if (genotypesArePhased) {
BialleleSNP prevBiall = new BialleleSNP(prevHetGenotype); SNPallelePair prevBiall = new SNPallelePair(prevHetGenotype);
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n"); logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n");
logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n"); logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n");
@ -311,6 +319,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
phaseStats.addIn(new PhasingStats(samplePhaseStats)); phaseStats.addIn(new PhasingStats(samplePhaseStats));
} }
public boolean passesPhasingThreshold(double PQ) {
return PQ >= phaseQualityThresh;
}
private static class GenotypeAndReadBases { private static class GenotypeAndReadBases {
public Genotype genotype; public Genotype genotype;
public ReadBasesAtPosition readBases; public ReadBasesAtPosition readBases;
@ -449,7 +461,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("Number of sites in window = " + index); logger.debug("Number of sites in window = " + index);
if (logger.isDebugEnabled()) { if (logger.isDebugEnabled()) {
logger.debug("ALL READS:"); logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:");
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) { for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue(); Read rd = nameToReads.getValue();
@ -458,85 +470,98 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
} }
private class ReadProperties { private class EdgeToReads {
public List<GraphEdge> rdEdges; private Map<GraphEdge, List<String>> edgeReads;
public int[] siteInds;
public ReadProperties(Read rd) { public EdgeToReads() {
this.siteInds = rd.getNonNullIndices(); this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
this.rdEdges = new LinkedList<GraphEdge>(); }
// sufficient to create a path linking the sites in rd, so they all end up in the same connected component: public void addRead(GraphEdge e, String readName) {
for (int i = 0; i < siteInds.length - 1; i++) { List<String> reads = edgeReads.get(e);
GraphEdge e = new GraphEdge(siteInds[i], siteInds[i + 1]); if (reads == null) {
rdEdges.add(e); reads = new LinkedList<String>();
edgeReads.put(e, reads);
} }
reads.add(readName);
}
public List<String> getReads(GraphEdge e) {
return edgeReads.get(e);
} }
} }
private class EdgeCounts { private class IntegerSet implements Iterable<Integer> {
private Map<GraphEdge, Integer> counts; private Set<Integer> list;
public EdgeCounts() { public IntegerSet(Set<Integer> list) {
this.counts = new TreeMap<GraphEdge, Integer>(); // implemented GraphEdge.compareTo() this.list = list;
} }
public int getCount(GraphEdge e) { public boolean contains(int i) {
Integer count = counts.get(e); return list.contains(i);
if (count == null)
return 0;
return count;
} }
public int incrementEdge(GraphEdge e) { public Iterator<Integer> iterator() {
Integer eCount = counts.get(e); return list.iterator();
int cnt;
if (eCount == null)
cnt = 0;
else
cnt = eCount;
cnt++;
counts.put(e, cnt);
return cnt;
} }
public int decrementEdge(GraphEdge e) { public String toString() {
Integer eCount = counts.get(e); StringBuilder sb = new StringBuilder();
if (eCount == null) for (int i : this) {
return 0; sb.append(i + ", ");
}
int cnt = eCount - 1; return sb.toString();
counts.put(e, cnt);
return cnt;
} }
} }
public Set<String> removeExtraneousReads(int numHetSites) { public Set<String> removeExtraneousReads(int numHetSites) {
Graph readGraph = new Graph(numHetSites); Graph readGraph = new Graph(numHetSites);
Map<String, ReadProperties> readToGraphProperties = new HashMap<String, ReadProperties>(); EdgeToReads edgeToReads = new EdgeToReads();
EdgeCounts edgeCounts = new EdgeCounts(); Set<Integer> sitesWithEdges = new TreeSet<Integer>();
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) { for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue(); Read rd = nameToReads.getValue();
ReadProperties rp = new ReadProperties(rd); int[] siteInds = rd.getNonNullIndices();
if (!rp.rdEdges.isEmpty()) { // otherwise, this read is clearly irrelevant since it can't link anything // Connect each pair of non-null sites in rd:
for (GraphEdge e : rp.rdEdges) { for (int i = 0; i < siteInds.length; i++) {
readGraph.addEdge(e); for (int j = i + 1; j < siteInds.length; j++) {
GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]);
logger.debug("Read = " + rdName + " is adding edge: " + e); logger.debug("Read = " + rdName + " is adding edge: " + e);
readGraph.addEdge(e);
edgeCounts.incrementEdge(e); edgeToReads.addRead(e, rdName);
sitesWithEdges.add(e.v1);
sitesWithEdges.add(e.v2);
} }
readToGraphProperties.put(rdName, rp);
} }
} }
logger.debug("Read graph:\n" + readGraph); logger.debug("Read graph:\n" + readGraph);
Set<String> keepReads = new HashSet<String>(); Set<String> keepReads = new HashSet<String>();
// Check which Reads are involved in paths from (phasingSiteIndex - 1) to (phasingSiteIndex): /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
In detail:
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by
considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and
cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant.
By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example,
if a path exists from prev ---> v1 that excludes v2.
Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well.
And, if the path uses edges from other reads, then keeping all other reads that contribute those edges
[which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist.
NOTE:
If we would use NON-UNIFORM priors for the various haplotypes, then this calculation would not be correct, since the equivalence of:
1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values).
2. The read has edges involved in a path from prev ---> cur.
DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior.
*/
int prev = phasingSiteIndex - 1; int prev = phasingSiteIndex - 1;
int cur = phasingSiteIndex; int cur = phasingSiteIndex;
@ -546,56 +571,51 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
return keepReads; return keepReads;
} }
for (Map.Entry<String, ReadProperties> rdEdgesEntry : readToGraphProperties.entrySet()) { /* Check the connected components of prev and cur when removing each individual vertex's edges:
String testRead = rdEdgesEntry.getKey(); [Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)]
ReadProperties rp = rdEdgesEntry.getValue(); */
logger.debug("Testing the connectivity of Read: " + testRead); IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites];
IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites];
for (int i : sitesWithEdges) {
logger.debug("Calculating CC after removing edges of site: " + i);
// Check the connected components after removing this read's UNIQUE edges: // Remove all edges incident to i and see which positions have paths to prev and cur:
for (GraphEdge e : rp.rdEdges) { Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i);
if (edgeCounts.getCount(e) == 1) // otherwise, the edge still exists without this read
readGraph.removeEdge(e); // Run-time for efficiently calculating connected components using DisjointSet: O(E)
}
DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); DisjointSet ccAfterRemove = readGraph.getConnectedComponents();
removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges));
removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges));
/* testRead contributes a path between prev and cur iff: logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
There exists i != j s.t. testRead[i] != null, testRead[j] != null, ccAfterRemove.inSameSet(prev,i) && ccAfterRemove.inSameSet(j,cur) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
[since ALL non-null indices in testRead are connected to one another, as one clique].
// Add the removed edges back in:
readGraph.addEdges(removedEdges);
}
for (GraphEdge e : readGraph) {
logger.debug("Testing the path-connectivity of Edge: " + e);
/* Edge e={v1,v2} contributes a path between prev and cur for testRead iff:
testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa].
Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges,
and the path from v2 ---> cur will NOT contain v1.
*/ */
List<Integer> sameCCasPrev = ccAfterRemove.inSameSetAs(prev, rp.siteInds); boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.v1].contains(e.v2) && removedSiteSameCCAsCur[e.v2].contains(e.v1);
List<Integer> sameCCasCur = ccAfterRemove.inSameSetAs(cur, rp.siteInds); boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.v2].contains(e.v1) && removedSiteSameCCAsCur[e.v1].contains(e.v2);
if (logger.isDebugEnabled()) {
StringBuilder sb = new StringBuilder("sameCCasPrev:");
for (int ind : sameCCasPrev)
sb.append(" " + ind);
logger.debug(sb.toString());
sb = new StringBuilder("sameCCasCur:"); if (prevTo2and1ToCur || prevTo1and2ToCur) {
for (int ind : sameCCasCur) for (String readName : edgeToReads.getReads(e)) {
sb.append(" " + ind); keepReads.add(readName);
logger.debug(sb.toString());
}
boolean keepRead = false; if (logger.isDebugEnabled()) {
if (!sameCCasPrev.isEmpty() && !sameCCasCur.isEmpty()) { // There exists a path from prev to cur that goes through the sites in testRead if (prevTo2and1ToCur)
// Now, make sure that TWO DISTINCT sites, i and j, in testRead are used in the path: logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v2 + " -> " + e.v1 + " ---> " + cur);
Set<Integer> union = new HashSet<Integer>(sameCCasPrev); else
union.addAll(sameCCasCur); logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v1 + " -> " + e.v2 + " ---> " + cur);
if (union.size() >= 2) // i != j }
keepRead = true; }
}
if (keepRead) {
logger.debug("Read is part of path from " + prev + " to " + cur);
keepReads.add(testRead);
// Add the removed edges back in, since we're keeping the read:
for (GraphEdge e : rp.rdEdges)
readGraph.addEdge(e);
}
else { // Decrease the count for the edges [note that any read-specific edges were already removed above]:
for (GraphEdge e : rp.rdEdges)
edgeCounts.decrementEdge(e);
} }
} }
@ -677,7 +697,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private PhaseResult phaseSample(PhasingWindow phaseWindow) { private PhaseResult phaseSample(PhasingWindow phaseWindow) {
/* Will map a phase and its "complement" to a single representative phase, /* Will map a phase and its "complement" to a single representative phase,
and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
*/ */
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2); HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2);
PhasingTable sampleHaps = tabCreator.getNewTable(); PhasingTable sampleHaps = tabCreator.getNewTable();
@ -693,6 +713,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
// Update the phasing table based on each of the sub-reads for this sample: // Update the phasing table based on each of the sub-reads for this sample:
MaxHaplotypeAndQuality prevMaxHapAndQual = null;
int numHighQualityIterations = 0;
int numInconsistentIterations = 0;
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) { for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
Read rd = nameToReads.getValue(); Read rd = nameToReads.getValue();
@ -704,38 +727,88 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
} }
//
//
//
// Check the current best haplotype assignment:
MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false);
logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
if (prevMaxHapAndQual != null && passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) {
numHighQualityIterations++;
if (!curMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative().equals(prevMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative()) || // switched phase
curMaxHapAndQual.phaseQuality < 0.9 * prevMaxHapAndQual.phaseQuality) { // a 10% ["significant"] decrease in PQ
logger.debug("Inconsistent read found!");
numInconsistentIterations++;
}
}
prevMaxHapAndQual = curMaxHapAndQual;
//
//
//
} }
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
// Marginalize each haplotype to its first 2 positions: logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps); logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
// Determine the phase at this position: double repPhaseQuality = maxHapQual.phaseQuality;
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry(); //
double posteriorProb = maxEntry.getScore().getValue(); //
if (numInconsistentIterations / (double) numHighQualityIterations >= 0.1) {
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb): //
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO); // ????
for (PhasingTable.PhasingTableEntry pte : sampleHaps) { // NEED TO CHANGE phaseSite() to always output PQ field, EVEN if it's LESS THAN threshold ??????
if (pte != maxEntry) // ????
sumErrorProbs.plusEqual(pte.getScore()); //
repPhaseQuality *= -1;
} }
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value()); //
//
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality); return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), repPhaseQuality);
}
return new PhaseResult(maxEntry.getHaplotypeClass().getRepresentative(), phaseQuality); private static class MaxHaplotypeAndQuality {
public PhasingTable.PhasingTableEntry maxEntry;
public double phaseQuality;
public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) {
// Marginalize each haplotype to its first 2 positions:
hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable);
if (printDebug)
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n");
// Determine the phase at this position:
hapTable.normalizeScores();
if (printDebug)
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n");
this.maxEntry = hapTable.maxEntry();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
this.phaseQuality = getPhasingQuality(hapTable, maxEntry);
}
// Returns the PQ of entry (within table hapTable, which MUST be normalized):
public static double getPhasingQuality(PhasingTable hapTable, PhasingTable.PhasingTableEntry entry) {
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : hapTable) {
if (pte != entry)
sumErrorProbs.plusEqual(pte.getScore());
}
return -10.0 * (sumErrorProbs.getLog10Value());
}
} }
/* /*
Ensure that curBiall is phased relative to prevBiall as specified by hap. Ensure that curBiall is phased relative to prevBiall as specified by hap.
*/ */
public static void ensurePhasing(SNPallelePair curBiall, SNPallelePair prevBiall, Haplotype hap) {
public static void ensurePhasing(BialleleSNP curBiall, BialleleSNP prevBiall, Haplotype hap) {
if (hap.size() < 2) if (hap.size() < 2)
throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!"); throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!");
@ -784,7 +857,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
byte refBase; byte refBase;
if (!vc.isIndel()) { if (!vc.isIndel()) {
Allele varAllele = vc.getReference(); Allele varAllele = vc.getReference();
refBase = BialleleSNP.getSingleBase(varAllele); refBase = SNPallelePair.getSingleBase(varAllele);
} }
else { else {
refBase = vc.getReferenceBaseForIndel(); refBase = vc.getReferenceBaseForIndel();
@ -1023,14 +1096,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
byte[] hapBases = new byte[numSites]; byte[] hapBases = new byte[numSites];
for (int i = 0; i < numSites; i++) { for (int i = 0; i < numSites; i++) {
Allele alleleI = genotypes[i].getAllele(alleleInds[i]); Allele alleleI = genotypes[i].getAllele(alleleInds[i]);
hapBases[i] = BialleleSNP.getSingleBase(alleleI); hapBases[i] = SNPallelePair.getSingleBase(alleleI);
} }
allHaps.add(new Haplotype(hapBases)); allHaps.add(new Haplotype(hapBases));
} }
return allHaps; return allHaps;
} }
public static PhasingTable marginalizeTable(PhasingTable table) { public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>(); Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
for (PhasingTable.PhasingTableEntry pte : table) { for (PhasingTable.PhasingTableEntry pte : table) {
Haplotype rep = pte.getHaplotypeClass().getRepresentative(); Haplotype rep = pte.getHaplotypeClass().getRepresentative();
@ -1056,23 +1129,23 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator { private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator {
private BialleleSNP[] bialleleSNPs; private SNPallelePair[] bialleleSNPs;
private int startIndex; private int startIndex;
private int marginalizeLength; private int marginalizeLength;
public BiallelicComplementHaplotypeTableCreator(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { public BiallelicComplementHaplotypeTableCreator(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
super(hetGenotypes); super(hetGenotypes);
this.bialleleSNPs = new BialleleSNP[genotypes.length]; this.bialleleSNPs = new SNPallelePair[genotypes.length];
for (int i = 0; i < genotypes.length; i++) for (int i = 0; i < genotypes.length; i++)
bialleleSNPs[i] = new BialleleSNP(genotypes[i]); bialleleSNPs[i] = new SNPallelePair(genotypes[i]);
this.startIndex = startIndex; this.startIndex = startIndex;
this.marginalizeLength = marginalizeLength; this.marginalizeLength = marginalizeLength;
} }
public PhasingTable getNewTable() { public PhasingTable getNewTable() {
double hapClassPrior = 1.0; // can change later double hapClassPrior = 1.0; // can change later, BUT see NOTE above in removeExtraneousReads()
PhasingTable table = new PhasingTable(); PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) { for (Haplotype hap : getAllHaplotypes()) {

View File

@ -28,15 +28,15 @@ import org.broad.tribble.util.variantcontext.Genotype;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
public class BialleleSNP extends Biallele { public class SNPallelePair extends AllelePair {
public BialleleSNP(Genotype gt) { public SNPallelePair(Genotype gt) {
super(gt); super(gt);
if (getTopAllele().getBases().length != 1) if (getTopAllele().getBases().length != 1)
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!"); throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
if (getBottomAllele().getBases().length != 1) if (getBottomAllele().getBases().length != 1)
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!"); throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
} }
public byte getTopBase() { public byte getTopBase() {