Fixed phasing algorithm to: 1. More correctly weed out irrelevant reads and sites; 2. Crudely flag sites with large phase discrepancies betweens reads
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4368 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5a5c72c80d
commit
8d8980e8eb
|
|
@ -111,15 +111,15 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
samplePrevGenotypes.put(samp, null);
|
||||
}
|
||||
else { // Both comp and eval have a non-null Genotype at this site:
|
||||
Biallele compBiallele = new Biallele(compSampGt);
|
||||
Biallele evalBiallele = new Biallele(evalSampGt);
|
||||
AllelePair compAllelePair = new AllelePair(compSampGt);
|
||||
AllelePair evalAllelePair = new AllelePair(evalSampGt);
|
||||
|
||||
boolean breakPhasing = false;
|
||||
if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom())
|
||||
breakPhasing = true; // since they are not both het or both hom
|
||||
else { // both are het, or both are hom:
|
||||
boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compBiallele, evalBiallele) && bottomMatchesBottom(compBiallele, evalBiallele));
|
||||
boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compBiallele, evalBiallele) && bottomMatchesTop(compBiallele, evalBiallele));
|
||||
boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair));
|
||||
boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair));
|
||||
if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop)
|
||||
breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample
|
||||
}
|
||||
|
|
@ -154,12 +154,12 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
interesting.addReason("ONLY_EVAL", samp, group, "");
|
||||
}
|
||||
else { // both comp and eval are phased:
|
||||
Biallele prevCompBiallele = new Biallele(prevCompAndEval.getCompGenotpye());
|
||||
Biallele prevEvalBiallele = new Biallele(prevCompAndEval.getEvalGenotype());
|
||||
AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye());
|
||||
AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype());
|
||||
|
||||
// Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample:
|
||||
boolean topsMatch = (topMatchesTop(prevCompBiallele, prevEvalBiallele) && topMatchesTop(compBiallele, evalBiallele));
|
||||
boolean topMatchesBottom = (topMatchesBottom(prevCompBiallele, prevEvalBiallele) && topMatchesBottom(compBiallele, evalBiallele));
|
||||
boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair));
|
||||
boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair));
|
||||
|
||||
if (topsMatch || topMatchesBottom) {
|
||||
ps.phasesAgree++;
|
||||
|
|
@ -172,7 +172,7 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
else {
|
||||
ps.phasesDisagree++;
|
||||
logger.debug("SWITCHED locus: " + curLocus);
|
||||
interesting.addReason("SWITCH", samp, group, toString(prevCompBiallele, compBiallele) + " -> " + toString(prevEvalBiallele, evalBiallele));
|
||||
interesting.addReason("SWITCH", samp, group, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -212,23 +212,23 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
|
|||
return new Double(pq.toString());
|
||||
}
|
||||
|
||||
public boolean topMatchesTop(Biallele b1, Biallele b2) {
|
||||
public boolean topMatchesTop(AllelePair b1, AllelePair b2) {
|
||||
return b1.getTopAllele().equals(b2.getTopAllele());
|
||||
}
|
||||
|
||||
public boolean topMatchesBottom(Biallele b1, Biallele b2) {
|
||||
public boolean topMatchesBottom(AllelePair b1, AllelePair b2) {
|
||||
return b1.getTopAllele().equals(b2.getBottomAllele());
|
||||
}
|
||||
|
||||
public boolean bottomMatchesTop(Biallele b1, Biallele b2) {
|
||||
public boolean bottomMatchesTop(AllelePair b1, AllelePair b2) {
|
||||
return topMatchesBottom(b2, b1);
|
||||
}
|
||||
|
||||
public boolean bottomMatchesBottom(Biallele b1, Biallele b2) {
|
||||
public boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) {
|
||||
return b1.getBottomAllele().equals(b2.getBottomAllele());
|
||||
}
|
||||
|
||||
public String toString(Biallele prev, Biallele cur) {
|
||||
public String toString(AllelePair prev, AllelePair cur) {
|
||||
return prev.getTopAllele().getBaseString() + "," + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "," + cur.getBottomAllele().getBaseString();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,13 +30,13 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class Biallele {
|
||||
public class AllelePair {
|
||||
private Allele top;
|
||||
private Allele bottom;
|
||||
|
||||
public Biallele(Genotype gt) {
|
||||
public AllelePair(Genotype gt) {
|
||||
if (gt.getPloidy() != 2)
|
||||
throw new ReviewedStingException("Biallele must have ploidy of 2!");
|
||||
throw new ReviewedStingException("AllelePair must have ploidy of 2!");
|
||||
|
||||
this.top = gt.getAllele(0);
|
||||
this.bottom = gt.getAllele(1);
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.phasing;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
|
|
@ -55,11 +54,11 @@ public class DisjointSet {
|
|||
}
|
||||
|
||||
public boolean inSameSet(int x, int y) {
|
||||
return (findSet(x) == findSet(y));
|
||||
return (x == y || nodes[x].parent == nodes[y].parent || findSet(x) == findSet(y));
|
||||
}
|
||||
|
||||
public List<Integer> inSameSetAs(int x, int[] testSet) {
|
||||
List<Integer> sameSetInds = new LinkedList<Integer>();
|
||||
public Set<Integer> inSameSetAs(int x, Collection<Integer> testSet) {
|
||||
Set<Integer> sameSetInds = new TreeSet<Integer>();
|
||||
|
||||
int xSet = findSet(x);
|
||||
for (int t : testSet) {
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ import java.util.*;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Represents an undirected graph with no self-edges:
|
||||
public class Graph implements Iterable<GraphEdge> {
|
||||
private Neighbors[] adj;
|
||||
|
||||
|
|
@ -36,45 +37,58 @@ public class Graph implements Iterable<GraphEdge> {
|
|||
}
|
||||
|
||||
public void addEdge(GraphEdge e) {
|
||||
if (e.v1 == e.v2) // do not permit self-edges
|
||||
return;
|
||||
|
||||
adj[e.v1].addNeighbor(e);
|
||||
adj[e.v2].addNeighbor(e);
|
||||
}
|
||||
|
||||
public void addEdges(Collection<GraphEdge> edges) {
|
||||
for (GraphEdge e : edges)
|
||||
addEdge(e);
|
||||
}
|
||||
|
||||
public void removeEdge(GraphEdge e) {
|
||||
adj[e.v1].removeNeighbor(e);
|
||||
adj[e.v2].removeNeighbor(e);
|
||||
}
|
||||
|
||||
public Collection<GraphEdge> removeAllIncidentEdges(int vertexIndex) {
|
||||
Collection<GraphEdge> incidentEdges = new TreeSet<GraphEdge>(adj[vertexIndex].neighbors); // implemented GraphEdge.compareTo()
|
||||
|
||||
for (GraphEdge neighbEdge : incidentEdges) {
|
||||
if (vertexIndex != neighbEdge.v1) // vertexIndex == neighbEdge.v2
|
||||
adj[neighbEdge.v1].removeNeighbor(neighbEdge);
|
||||
else if (vertexIndex != neighbEdge.v2) // vertexIndex == neighbEdge.v1
|
||||
adj[neighbEdge.v2].removeNeighbor(neighbEdge);
|
||||
}
|
||||
adj[vertexIndex].clearAllNeighbors();
|
||||
|
||||
return incidentEdges;
|
||||
}
|
||||
|
||||
public DisjointSet getConnectedComponents() {
|
||||
DisjointSet cc = new DisjointSet(adj.length);
|
||||
|
||||
for (int i = 0; i < adj.length; i++)
|
||||
for (GraphEdge e : adj[i])
|
||||
cc.setUnion(e.v1, e.v2);
|
||||
for (GraphEdge e : this)
|
||||
cc.setUnion(e.v1, e.v2);
|
||||
|
||||
return cc;
|
||||
}
|
||||
|
||||
// Note that this will give each edge TWICE [since e=(v1,v2) is stored as a neighbor for both v1 and v2]
|
||||
public Iterator<GraphEdge> iterator() {
|
||||
return new AllEdgesIterator();
|
||||
}
|
||||
|
||||
public List<GraphEdge> getAllEdges() {
|
||||
Set<GraphEdge> allEdges = new TreeSet<GraphEdge>(); // implemented GraphEdge.compareTo()
|
||||
for (GraphEdge e : this)
|
||||
allEdges.add(e);
|
||||
|
||||
return new LinkedList<GraphEdge>(allEdges);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < adj.length; i++) {
|
||||
sb.append(i + ":");
|
||||
for (GraphEdge e : adj[i])
|
||||
sb.append(" " + e);
|
||||
for (GraphEdge e : adj[i]) {
|
||||
sb.append(" " + (e.v1 == i ? e.v2 : e.v1));
|
||||
}
|
||||
sb.append("\n");
|
||||
}
|
||||
|
||||
|
|
@ -84,18 +98,29 @@ public class Graph implements Iterable<GraphEdge> {
|
|||
private class AllEdgesIterator implements Iterator<GraphEdge> {
|
||||
private int curInd;
|
||||
private Iterator<GraphEdge> innerIt;
|
||||
private GraphEdge nextEdge;
|
||||
|
||||
public AllEdgesIterator() {
|
||||
curInd = 0;
|
||||
innerIt = null;
|
||||
nextEdge = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (nextEdge != null)
|
||||
return true;
|
||||
|
||||
for (; curInd < adj.length; curInd++) {
|
||||
if (innerIt == null)
|
||||
innerIt = adj[curInd].iterator();
|
||||
if (innerIt.hasNext())
|
||||
return true;
|
||||
|
||||
while (innerIt.hasNext()) {
|
||||
GraphEdge e = innerIt.next();
|
||||
if (e.v1 == curInd) { // only want to see each edge once
|
||||
nextEdge = e;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
innerIt = null;
|
||||
}
|
||||
|
|
@ -107,7 +132,9 @@ public class Graph implements Iterable<GraphEdge> {
|
|||
if (!hasNext())
|
||||
throw new NoSuchElementException();
|
||||
|
||||
return innerIt.next();
|
||||
GraphEdge tmpEdge = nextEdge;
|
||||
nextEdge = null;
|
||||
return tmpEdge;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
|
@ -133,5 +160,9 @@ public class Graph implements Iterable<GraphEdge> {
|
|||
public Iterator<GraphEdge> iterator() {
|
||||
return neighbors.iterator();
|
||||
}
|
||||
|
||||
public void clearAllNeighbors() {
|
||||
neighbors.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import javax.naming.OperationNotSupportedException;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -245,7 +244,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
if (isCalledDiploidGenotype(gt) && gt.isHet()) { // Can attempt to phase this genotype
|
||||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
|
||||
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
|
||||
BialleleSNP biall = new BialleleSNP(gt);
|
||||
SNPallelePair biall = new SNPallelePair(gt);
|
||||
logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall);
|
||||
|
||||
DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
|
||||
|
|
@ -257,9 +256,18 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
Genotype prevHetGenotype = prevUvc.getGenotype(samp);
|
||||
|
||||
PhaseResult pr = phaseSample(phaseWindow);
|
||||
boolean genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh);
|
||||
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
|
||||
|
||||
//
|
||||
//
|
||||
if (pr.phaseQuality < 0) {
|
||||
logger.warn("MORE than 10% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(vc));
|
||||
}
|
||||
//
|
||||
//
|
||||
|
||||
if (genotypesArePhased) {
|
||||
BialleleSNP prevBiall = new BialleleSNP(prevHetGenotype);
|
||||
SNPallelePair prevBiall = new SNPallelePair(prevHetGenotype);
|
||||
|
||||
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n");
|
||||
logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n");
|
||||
|
|
@ -311,6 +319,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
phaseStats.addIn(new PhasingStats(samplePhaseStats));
|
||||
}
|
||||
|
||||
public boolean passesPhasingThreshold(double PQ) {
|
||||
return PQ >= phaseQualityThresh;
|
||||
}
|
||||
|
||||
private static class GenotypeAndReadBases {
|
||||
public Genotype genotype;
|
||||
public ReadBasesAtPosition readBases;
|
||||
|
|
@ -449,7 +461,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
logger.debug("Number of sites in window = " + index);
|
||||
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("ALL READS:");
|
||||
logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:");
|
||||
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
|
||||
String rdName = nameToReads.getKey();
|
||||
Read rd = nameToReads.getValue();
|
||||
|
|
@ -458,85 +470,98 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
}
|
||||
|
||||
private class ReadProperties {
|
||||
public List<GraphEdge> rdEdges;
|
||||
public int[] siteInds;
|
||||
private class EdgeToReads {
|
||||
private Map<GraphEdge, List<String>> edgeReads;
|
||||
|
||||
public ReadProperties(Read rd) {
|
||||
this.siteInds = rd.getNonNullIndices();
|
||||
this.rdEdges = new LinkedList<GraphEdge>();
|
||||
public EdgeToReads() {
|
||||
this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
|
||||
}
|
||||
|
||||
// sufficient to create a path linking the sites in rd, so they all end up in the same connected component:
|
||||
for (int i = 0; i < siteInds.length - 1; i++) {
|
||||
GraphEdge e = new GraphEdge(siteInds[i], siteInds[i + 1]);
|
||||
rdEdges.add(e);
|
||||
public void addRead(GraphEdge e, String readName) {
|
||||
List<String> reads = edgeReads.get(e);
|
||||
if (reads == null) {
|
||||
reads = new LinkedList<String>();
|
||||
edgeReads.put(e, reads);
|
||||
}
|
||||
reads.add(readName);
|
||||
}
|
||||
|
||||
public List<String> getReads(GraphEdge e) {
|
||||
return edgeReads.get(e);
|
||||
}
|
||||
}
|
||||
|
||||
private class EdgeCounts {
|
||||
private Map<GraphEdge, Integer> counts;
|
||||
private class IntegerSet implements Iterable<Integer> {
|
||||
private Set<Integer> list;
|
||||
|
||||
public EdgeCounts() {
|
||||
this.counts = new TreeMap<GraphEdge, Integer>(); // implemented GraphEdge.compareTo()
|
||||
public IntegerSet(Set<Integer> list) {
|
||||
this.list = list;
|
||||
}
|
||||
|
||||
public int getCount(GraphEdge e) {
|
||||
Integer count = counts.get(e);
|
||||
if (count == null)
|
||||
return 0;
|
||||
|
||||
return count;
|
||||
public boolean contains(int i) {
|
||||
return list.contains(i);
|
||||
}
|
||||
|
||||
public int incrementEdge(GraphEdge e) {
|
||||
Integer eCount = counts.get(e);
|
||||
int cnt;
|
||||
if (eCount == null)
|
||||
cnt = 0;
|
||||
else
|
||||
cnt = eCount;
|
||||
|
||||
cnt++;
|
||||
counts.put(e, cnt);
|
||||
return cnt;
|
||||
public Iterator<Integer> iterator() {
|
||||
return list.iterator();
|
||||
}
|
||||
|
||||
public int decrementEdge(GraphEdge e) {
|
||||
Integer eCount = counts.get(e);
|
||||
if (eCount == null)
|
||||
return 0;
|
||||
|
||||
int cnt = eCount - 1;
|
||||
counts.put(e, cnt);
|
||||
return cnt;
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i : this) {
|
||||
sb.append(i + ", ");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
public Set<String> removeExtraneousReads(int numHetSites) {
|
||||
Graph readGraph = new Graph(numHetSites);
|
||||
Map<String, ReadProperties> readToGraphProperties = new HashMap<String, ReadProperties>();
|
||||
EdgeCounts edgeCounts = new EdgeCounts();
|
||||
EdgeToReads edgeToReads = new EdgeToReads();
|
||||
Set<Integer> sitesWithEdges = new TreeSet<Integer>();
|
||||
|
||||
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
|
||||
String rdName = nameToReads.getKey();
|
||||
Read rd = nameToReads.getValue();
|
||||
|
||||
ReadProperties rp = new ReadProperties(rd);
|
||||
if (!rp.rdEdges.isEmpty()) { // otherwise, this read is clearly irrelevant since it can't link anything
|
||||
for (GraphEdge e : rp.rdEdges) {
|
||||
readGraph.addEdge(e);
|
||||
int[] siteInds = rd.getNonNullIndices();
|
||||
// Connect each pair of non-null sites in rd:
|
||||
for (int i = 0; i < siteInds.length; i++) {
|
||||
for (int j = i + 1; j < siteInds.length; j++) {
|
||||
GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]);
|
||||
logger.debug("Read = " + rdName + " is adding edge: " + e);
|
||||
readGraph.addEdge(e);
|
||||
|
||||
edgeCounts.incrementEdge(e);
|
||||
edgeToReads.addRead(e, rdName);
|
||||
|
||||
sitesWithEdges.add(e.v1);
|
||||
sitesWithEdges.add(e.v2);
|
||||
}
|
||||
readToGraphProperties.put(rdName, rp);
|
||||
}
|
||||
}
|
||||
logger.debug("Read graph:\n" + readGraph);
|
||||
Set<String> keepReads = new HashSet<String>();
|
||||
|
||||
// Check which Reads are involved in paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
|
||||
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
|
||||
|
||||
In detail:
|
||||
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
|
||||
A read provides non-trivial bias toward the final haplotype decision if it participates in a path from prev ---> cur. This is tested by
|
||||
considering each edge that the read contributes. For edge e=(v1,v2), if there exists a path from prev ---> v1 [that doesn't include v2] and
|
||||
cur ---> v2 [that doesn't include v1], then there is a path from prev ---> cur that uses e, hence making the read significant.
|
||||
By excluding each vertex's edges and then calculating connected components, we are able to make the determination, for example,
|
||||
if a path exists from prev ---> v1 that excludes v2.
|
||||
|
||||
Furthermore, if the path DOES use other edges that exist solely due to the read, then that's fine, since adding in the read will give those edges as well.
|
||||
And, if the path uses edges from other reads, then keeping all other reads that contribute those edges
|
||||
[which will happen since those edges are also in paths from prev ---> cur] is sufficient for this path to exist.
|
||||
|
||||
NOTE:
|
||||
If we would use NON-UNIFORM priors for the various haplotypes, then this calculation would not be correct, since the equivalence of:
|
||||
1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values).
|
||||
2. The read has edges involved in a path from prev ---> cur.
|
||||
DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior.
|
||||
*/
|
||||
int prev = phasingSiteIndex - 1;
|
||||
int cur = phasingSiteIndex;
|
||||
|
||||
|
|
@ -546,56 +571,51 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
return keepReads;
|
||||
}
|
||||
|
||||
for (Map.Entry<String, ReadProperties> rdEdgesEntry : readToGraphProperties.entrySet()) {
|
||||
String testRead = rdEdgesEntry.getKey();
|
||||
ReadProperties rp = rdEdgesEntry.getValue();
|
||||
logger.debug("Testing the connectivity of Read: " + testRead);
|
||||
/* Check the connected components of prev and cur when removing each individual vertex's edges:
|
||||
[Total run-time: for each vertex, calculate connected components after removing it's edges: O(V * E)]
|
||||
*/
|
||||
IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites];
|
||||
IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites];
|
||||
for (int i : sitesWithEdges) {
|
||||
logger.debug("Calculating CC after removing edges of site: " + i);
|
||||
|
||||
// Check the connected components after removing this read's UNIQUE edges:
|
||||
for (GraphEdge e : rp.rdEdges) {
|
||||
if (edgeCounts.getCount(e) == 1) // otherwise, the edge still exists without this read
|
||||
readGraph.removeEdge(e);
|
||||
}
|
||||
// Remove all edges incident to i and see which positions have paths to prev and cur:
|
||||
Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i);
|
||||
|
||||
// Run-time for efficiently calculating connected components using DisjointSet: O(E)
|
||||
DisjointSet ccAfterRemove = readGraph.getConnectedComponents();
|
||||
removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges));
|
||||
removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges));
|
||||
|
||||
/* testRead contributes a path between prev and cur iff:
|
||||
There exists i != j s.t. testRead[i] != null, testRead[j] != null, ccAfterRemove.inSameSet(prev,i) && ccAfterRemove.inSameSet(j,cur)
|
||||
[since ALL non-null indices in testRead are connected to one another, as one clique].
|
||||
logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
|
||||
logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
|
||||
|
||||
// Add the removed edges back in:
|
||||
readGraph.addEdges(removedEdges);
|
||||
}
|
||||
|
||||
for (GraphEdge e : readGraph) {
|
||||
logger.debug("Testing the path-connectivity of Edge: " + e);
|
||||
|
||||
/* Edge e={v1,v2} contributes a path between prev and cur for testRead iff:
|
||||
testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa].
|
||||
Note that the path from prev ---> v1 will NOT contain v2, since we removed all of v2's edges,
|
||||
and the path from v2 ---> cur will NOT contain v1.
|
||||
*/
|
||||
List<Integer> sameCCasPrev = ccAfterRemove.inSameSetAs(prev, rp.siteInds);
|
||||
List<Integer> sameCCasCur = ccAfterRemove.inSameSetAs(cur, rp.siteInds);
|
||||
if (logger.isDebugEnabled()) {
|
||||
StringBuilder sb = new StringBuilder("sameCCasPrev:");
|
||||
for (int ind : sameCCasPrev)
|
||||
sb.append(" " + ind);
|
||||
logger.debug(sb.toString());
|
||||
boolean prevTo2and1ToCur = removedSiteSameCCAsPrev[e.v1].contains(e.v2) && removedSiteSameCCAsCur[e.v2].contains(e.v1);
|
||||
boolean prevTo1and2ToCur = removedSiteSameCCAsPrev[e.v2].contains(e.v1) && removedSiteSameCCAsCur[e.v1].contains(e.v2);
|
||||
|
||||
sb = new StringBuilder("sameCCasCur:");
|
||||
for (int ind : sameCCasCur)
|
||||
sb.append(" " + ind);
|
||||
logger.debug(sb.toString());
|
||||
}
|
||||
if (prevTo2and1ToCur || prevTo1and2ToCur) {
|
||||
for (String readName : edgeToReads.getReads(e)) {
|
||||
keepReads.add(readName);
|
||||
|
||||
boolean keepRead = false;
|
||||
if (!sameCCasPrev.isEmpty() && !sameCCasCur.isEmpty()) { // There exists a path from prev to cur that goes through the sites in testRead
|
||||
// Now, make sure that TWO DISTINCT sites, i and j, in testRead are used in the path:
|
||||
Set<Integer> union = new HashSet<Integer>(sameCCasPrev);
|
||||
union.addAll(sameCCasCur);
|
||||
if (union.size() >= 2) // i != j
|
||||
keepRead = true;
|
||||
}
|
||||
|
||||
if (keepRead) {
|
||||
logger.debug("Read is part of path from " + prev + " to " + cur);
|
||||
keepReads.add(testRead);
|
||||
|
||||
// Add the removed edges back in, since we're keeping the read:
|
||||
for (GraphEdge e : rp.rdEdges)
|
||||
readGraph.addEdge(e);
|
||||
}
|
||||
else { // Decrease the count for the edges [note that any read-specific edges were already removed above]:
|
||||
for (GraphEdge e : rp.rdEdges)
|
||||
edgeCounts.decrementEdge(e);
|
||||
if (logger.isDebugEnabled()) {
|
||||
if (prevTo2and1ToCur)
|
||||
logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v2 + " -> " + e.v1 + " ---> " + cur);
|
||||
else
|
||||
logger.debug("Keep read " + readName + " due to path: " + prev + " ---> " + e.v1 + " -> " + e.v2 + " ---> " + cur);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -677,7 +697,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
private PhaseResult phaseSample(PhasingWindow phaseWindow) {
|
||||
/* Will map a phase and its "complement" to a single representative phase,
|
||||
and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
|
||||
and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
|
||||
*/
|
||||
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2);
|
||||
PhasingTable sampleHaps = tabCreator.getNewTable();
|
||||
|
|
@ -693,6 +713,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
// Update the phasing table based on each of the sub-reads for this sample:
|
||||
MaxHaplotypeAndQuality prevMaxHapAndQual = null;
|
||||
int numHighQualityIterations = 0;
|
||||
int numInconsistentIterations = 0;
|
||||
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
|
||||
Read rd = nameToReads.getValue();
|
||||
|
||||
|
|
@ -704,38 +727,88 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
// Check the current best haplotype assignment:
|
||||
MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false);
|
||||
logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
|
||||
if (prevMaxHapAndQual != null && passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) {
|
||||
numHighQualityIterations++;
|
||||
if (!curMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative().equals(prevMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative()) || // switched phase
|
||||
curMaxHapAndQual.phaseQuality < 0.9 * prevMaxHapAndQual.phaseQuality) { // a 10% ["significant"] decrease in PQ
|
||||
logger.debug("Inconsistent read found!");
|
||||
numInconsistentIterations++;
|
||||
}
|
||||
}
|
||||
prevMaxHapAndQual = curMaxHapAndQual;
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
}
|
||||
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
|
||||
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
|
||||
double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
|
||||
|
||||
// Marginalize each haplotype to its first 2 positions:
|
||||
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
|
||||
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
|
||||
logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
|
||||
logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
|
||||
|
||||
// Determine the phase at this position:
|
||||
sampleHaps.normalizeScores();
|
||||
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
|
||||
double repPhaseQuality = maxHapQual.phaseQuality;
|
||||
|
||||
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
|
||||
double posteriorProb = maxEntry.getScore().getValue();
|
||||
|
||||
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
|
||||
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
|
||||
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
|
||||
if (pte != maxEntry)
|
||||
sumErrorProbs.plusEqual(pte.getScore());
|
||||
//
|
||||
//
|
||||
if (numInconsistentIterations / (double) numHighQualityIterations >= 0.1) {
|
||||
//
|
||||
// ????
|
||||
// NEED TO CHANGE phaseSite() to always output PQ field, EVEN if it's LESS THAN threshold ??????
|
||||
// ????
|
||||
//
|
||||
repPhaseQuality *= -1;
|
||||
}
|
||||
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
|
||||
//
|
||||
//
|
||||
|
||||
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
|
||||
return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), repPhaseQuality);
|
||||
}
|
||||
|
||||
return new PhaseResult(maxEntry.getHaplotypeClass().getRepresentative(), phaseQuality);
|
||||
private static class MaxHaplotypeAndQuality {
|
||||
public PhasingTable.PhasingTableEntry maxEntry;
|
||||
public double phaseQuality;
|
||||
|
||||
public MaxHaplotypeAndQuality(PhasingTable hapTable, boolean printDebug) {
|
||||
// Marginalize each haplotype to its first 2 positions:
|
||||
hapTable = HaplotypeTableCreator.marginalizeAsNewTable(hapTable);
|
||||
if (printDebug)
|
||||
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n");
|
||||
|
||||
// Determine the phase at this position:
|
||||
hapTable.normalizeScores();
|
||||
if (printDebug)
|
||||
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n");
|
||||
|
||||
this.maxEntry = hapTable.maxEntry();
|
||||
|
||||
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
|
||||
this.phaseQuality = getPhasingQuality(hapTable, maxEntry);
|
||||
}
|
||||
|
||||
// Returns the PQ of entry (within table hapTable, which MUST be normalized):
|
||||
public static double getPhasingQuality(PhasingTable hapTable, PhasingTable.PhasingTableEntry entry) {
|
||||
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
|
||||
for (PhasingTable.PhasingTableEntry pte : hapTable) {
|
||||
if (pte != entry)
|
||||
sumErrorProbs.plusEqual(pte.getScore());
|
||||
}
|
||||
return -10.0 * (sumErrorProbs.getLog10Value());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Ensure that curBiall is phased relative to prevBiall as specified by hap.
|
||||
*/
|
||||
|
||||
public static void ensurePhasing(BialleleSNP curBiall, BialleleSNP prevBiall, Haplotype hap) {
|
||||
public static void ensurePhasing(SNPallelePair curBiall, SNPallelePair prevBiall, Haplotype hap) {
|
||||
if (hap.size() < 2)
|
||||
throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!");
|
||||
|
||||
|
|
@ -784,7 +857,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
byte refBase;
|
||||
if (!vc.isIndel()) {
|
||||
Allele varAllele = vc.getReference();
|
||||
refBase = BialleleSNP.getSingleBase(varAllele);
|
||||
refBase = SNPallelePair.getSingleBase(varAllele);
|
||||
}
|
||||
else {
|
||||
refBase = vc.getReferenceBaseForIndel();
|
||||
|
|
@ -1023,14 +1096,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
byte[] hapBases = new byte[numSites];
|
||||
for (int i = 0; i < numSites; i++) {
|
||||
Allele alleleI = genotypes[i].getAllele(alleleInds[i]);
|
||||
hapBases[i] = BialleleSNP.getSingleBase(alleleI);
|
||||
hapBases[i] = SNPallelePair.getSingleBase(alleleI);
|
||||
}
|
||||
allHaps.add(new Haplotype(hapBases));
|
||||
}
|
||||
return allHaps;
|
||||
}
|
||||
|
||||
public static PhasingTable marginalizeTable(PhasingTable table) {
|
||||
public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
|
||||
Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
|
||||
for (PhasingTable.PhasingTableEntry pte : table) {
|
||||
Haplotype rep = pte.getHaplotypeClass().getRepresentative();
|
||||
|
|
@ -1056,23 +1129,23 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator {
|
||||
private BialleleSNP[] bialleleSNPs;
|
||||
private SNPallelePair[] bialleleSNPs;
|
||||
private int startIndex;
|
||||
private int marginalizeLength;
|
||||
|
||||
public BiallelicComplementHaplotypeTableCreator(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
|
||||
super(hetGenotypes);
|
||||
|
||||
this.bialleleSNPs = new BialleleSNP[genotypes.length];
|
||||
this.bialleleSNPs = new SNPallelePair[genotypes.length];
|
||||
for (int i = 0; i < genotypes.length; i++)
|
||||
bialleleSNPs[i] = new BialleleSNP(genotypes[i]);
|
||||
bialleleSNPs[i] = new SNPallelePair(genotypes[i]);
|
||||
|
||||
this.startIndex = startIndex;
|
||||
this.marginalizeLength = marginalizeLength;
|
||||
}
|
||||
|
||||
public PhasingTable getNewTable() {
|
||||
double hapClassPrior = 1.0; // can change later
|
||||
double hapClassPrior = 1.0; // can change later, BUT see NOTE above in removeExtraneousReads()
|
||||
|
||||
PhasingTable table = new PhasingTable();
|
||||
for (Haplotype hap : getAllHaplotypes()) {
|
||||
|
|
|
|||
|
|
@ -28,15 +28,15 @@ import org.broad.tribble.util.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
public class BialleleSNP extends Biallele {
|
||||
public class SNPallelePair extends AllelePair {
|
||||
|
||||
public BialleleSNP(Genotype gt) {
|
||||
public SNPallelePair(Genotype gt) {
|
||||
super(gt);
|
||||
|
||||
if (getTopAllele().getBases().length != 1)
|
||||
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!");
|
||||
throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
|
||||
if (getBottomAllele().getBases().length != 1)
|
||||
throw new ReviewedStingException("LOGICAL ERROR: BialleleSNP may not contain non-SNP site!");
|
||||
throw new ReviewedStingException("LOGICAL ERROR: SNPallelePair may not contain non-SNP site!");
|
||||
}
|
||||
|
||||
public byte getTopBase() {
|
||||
Loading…
Reference in New Issue