From 44ccfc35312048298709f109b0f947f79c5c33ca Mon Sep 17 00:00:00 2001 From: fromer Date: Wed, 22 Sep 2010 21:29:58 +0000 Subject: [PATCH] Updated Phasing algorithm + evaluation module to properly implement haplotypes [including homozygous genotypes]; Implemented dynamic window phasing model for LARGE increase in efficiency git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4332 348d0f76-0448-11de-a6fe-93d51630548a --- .../varianteval/GenotypePhasingEvaluator.java | 45 +- .../gatk/walkers/phasing/DisjointSet.java | 102 +++ .../walkers/phasing/DoublyLinkedList.java | 208 +++++ .../gatk/walkers/phasing/Graph.java | 137 +++ .../gatk/walkers/phasing/GraphEdge.java | 61 ++ .../phasing/ReadBackedPhasingWalker.java | 819 +++++++++++++----- 6 files changed, 1123 insertions(+), 249 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DisjointSet.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DoublyLinkedList.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/Graph.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/GraphEdge.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java index f27d09145..bff5af37a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.playground.utils.report.tags.DataPoint; import org.broadinstitute.sting.playground.utils.report.utils.TableType; import org.broadinstitute.sting.utils.GenomeLoc; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; import java.util.*; @@ -104,9 +105,9 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { if (evalSampGenotypes != null) evalSampGt = evalSampGenotypes.get(samp); - if (compSampGt == null || evalSampGt == null) { - // Having a hom site (or an unphased het site) breaks the phasing for the sample - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: - if ((compSampGt != null && !permitsTransitivePhasing(compSampGt)) || (evalSampGt != null && !permitsTransitivePhasing(evalSampGt))) + if (compSampGt == null || evalSampGt == null) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] + // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: + if (isNonNullButUnphased(compSampGt) || isNonNullButUnphased(evalSampGt)) samplePrevGenotypes.put(samp, null); } else { // Both comp and eval have a non-null Genotype at this site: @@ -114,9 +115,9 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { Biallele evalBiallele = new Biallele(evalSampGt); boolean breakPhasing = false; - if (!compSampGt.isHet() || !evalSampGt.isHet()) - breakPhasing = true; - else { // both are het + if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) + breakPhasing = true; // since they are not both het or both hom + else { // both are het, or both are hom: boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compBiallele, evalBiallele) && bottomMatchesBottom(compBiallele, evalBiallele)); boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compBiallele, evalBiallele) && bottomMatchesTop(compBiallele, evalBiallele)); if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) @@ -126,16 +127,19 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { if (breakPhasing) { samplePrevGenotypes.put(samp, null); // nothing to do for this site, AND must remove any history for the future } - else { // comp and eval have the same het Genotype at this site: + else if (compSampGt.isHet() && evalSampGt.isHet()) { + /* comp and eval have the HET same Genotype at this site: + [Note that if both are hom, then nothing is done here, but the het history IS preserved]. + */ CompEvalGenotypes prevCompAndEval = samplePrevGenotypes.get(samp); if (prevCompAndEval != null && !prevCompAndEval.getLocus().onSameContig(curLocus)) // exclude curLocus if it is "phased" relative to a different chromosome prevCompAndEval = null; - // Replace the previous with current: + // Replace the previous hets with the current hets: samplePrevGenotypes.put(samp, curLocus, compSampGt, evalSampGt); if (prevCompAndEval != null) { - logger.debug("Potentially phaseable locus: " + curLocus); + logger.debug("Potentially phaseable het locus: " + curLocus + " [relative to previous het locus: " + prevCompAndEval.getLocus() + "]"); PhaseStats ps = samplePhasingStatistics.ensureSampleStats(samp); boolean compSampIsPhased = genotypesArePhasedAboveThreshold(compSampGt); @@ -149,7 +153,7 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { ps.onlyEvalPhased++; interesting.addReason("ONLY_EVAL", samp, group, ""); } - else { + else { // both comp and eval are phased: Biallele prevCompBiallele = new Biallele(prevCompAndEval.getCompGenotpye()); Biallele prevEvalBiallele = new Biallele(prevCompAndEval.getEvalGenotype()); @@ -159,6 +163,11 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { if (topsMatch || topMatchesBottom) { ps.phasesAgree++; + + Double compPQ = getPQ(compSampGt); + Double evalPQ = getPQ(evalSampGt); + if (compPQ != null && evalPQ != null && MathUtils.compareDoubles(compPQ, evalPQ) != 0) + interesting.addReason("PQ_CHANGE", samp, group, compPQ + " -> " + evalPQ); } else { ps.phasesDisagree++; @@ -183,16 +192,24 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { return (vc != null && !vc.isFiltered()); } + public boolean isNonNullButUnphased(Genotype gt) { + return (gt != null && !genotypesArePhasedAboveThreshold(gt)); + } + public boolean genotypesArePhasedAboveThreshold(Genotype gt) { if (!gt.genotypesArePhased()) return false; - Object pq = gt.getAttributes().get("PQ"); - return (pq == null || (new Double(pq.toString()) >= getVEWalker().minPhaseQuality)); + Double pq = getPQ(gt); + return (pq == null || pq >= getVEWalker().minPhaseQuality); } - public boolean permitsTransitivePhasing(Genotype gt) { - return (gt != null && gt.isHet() && genotypesArePhasedAboveThreshold(gt)); // only a phased het site lets the phase pass through + public static Double getPQ(Genotype gt) { + Object pq = gt.getAttributes().get("PQ"); + if (pq == null) + return null; + + return new Double(pq.toString()); } public boolean topMatchesTop(Biallele b1, Biallele b2) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DisjointSet.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DisjointSet.java new file mode 100644 index 000000000..78ac92be4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DisjointSet.java @@ -0,0 +1,102 @@ +package org.broadinstitute.sting.playground.gatk.walkers.phasing; + +import java.util.LinkedList; +import java.util.List; + +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +public class DisjointSet { + private ItemNode[] nodes; + + public DisjointSet(int numItems) { + this.nodes = new ItemNode[numItems]; + for (int i = 0; i < numItems; i++) + this.nodes[i] = new ItemNode(i); + } + + public int findSet(int itemIndex) { + // Find itemIndex's root Node: + ItemNode curNode = nodes[itemIndex]; + while (curNode.parent != null) + curNode = curNode.parent; + ItemNode root = curNode; + + // Perform path compression: + curNode = nodes[itemIndex]; + while (curNode != root) { + ItemNode next = curNode.parent; + curNode.parent = root; + curNode = next; + } + + return root.itemIndex; + } + + public boolean inSameSet(int x, int y) { + return (findSet(x) == findSet(y)); + } + + public List inSameSetAs(int x, int[] testSet) { + List sameSetInds = new LinkedList(); + + int xSet = findSet(x); + for (int t : testSet) { + if (findSet(t) == xSet) + sameSetInds.add(t); + } + return sameSetInds; + } + + public void setUnion(int x, int y) { + link(findSet(x), findSet(y)); + } + + private void link(int x, int y) { + if (x == y) + return; + + // Union by rank: + if (nodes[x].rank > nodes[y].rank) { + nodes[y].parent = nodes[x]; + } + else { // nodes[x].rank <= nodes[y].rank + nodes[x].parent = nodes[y]; + if (nodes[x].rank == nodes[y].rank) + nodes[y].rank++; + } + } + + private class ItemNode { + private int itemIndex; + private ItemNode parent; + private int rank; + + public ItemNode(int itemIndex) { + this.itemIndex = itemIndex; + this.parent = null; + this.rank = 0; + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DoublyLinkedList.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DoublyLinkedList.java new file mode 100644 index 000000000..154fd6f3b --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/DoublyLinkedList.java @@ -0,0 +1,208 @@ +package org.broadinstitute.sting.playground.gatk.walkers.phasing; + +import java.util.NoSuchElementException; + +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +public class DoublyLinkedList { + private DoublyLinkedNode first; + private DoublyLinkedNode last; + private int size; + + public DoublyLinkedList() { + this.first = null; + this.last = null; + this.size = 0; + } + + public boolean isEmpty() { + return first == null; + } + + public int size() { + return size; + } + + public void addFirst(E e) { + DoublyLinkedNode newNode = new DoublyLinkedNode(e); + + if (isEmpty()) + last = newNode; + else { + first.previous = newNode; + newNode.next = first; + } + first = newNode; + + size++; + } + + public void addLast(E e) { + DoublyLinkedNode newNode = new DoublyLinkedNode(e); + + if (isEmpty()) + first = newNode; + else { + last.next = newNode; + newNode.previous = last; + } + last = newNode; + + size++; + } + + public E removeFirst() { + if (isEmpty()) + throw new NoSuchElementException(); + E e = first.element; + + if (first.next == null) + last = null; + else + first.next.previous = null; + first = first.next; + + size--; + return e; + } + + public E removeLast() { + if (isEmpty()) + throw new NoSuchElementException(); + E e = last.element; + + if (last.previous == null) + first = null; + else + last.previous.next = null; + last = last.previous; + + size--; + return e; + } + + public E getFirst() { + if (isEmpty()) + throw new NoSuchElementException(); + + return first.element; + } + + public E getLast() { + if (isEmpty()) + throw new NoSuchElementException(); + + return last.element; + } + + public E peek() { + if (isEmpty()) + return null; + + return getFirst(); + } + + public E remove() { + return removeFirst(); + } + + public boolean add(E e) { + addLast(e); + return true; + } + + public BidirectionalIterator iterator() { + return new BidirectionalIterator(this); + } + + + private static class DoublyLinkedNode { + private E element = null; + private DoublyLinkedNode next = null; + private DoublyLinkedNode previous = null; + + public DoublyLinkedNode(E element) { + this.element = element; + this.next = null; + this.previous = null; + } + } + + + public static class BidirectionalIterator implements Cloneable { + private DoublyLinkedNode nextNode; + private DoublyLinkedNode lastNode; + + private BidirectionalIterator(DoublyLinkedNode nextNode, DoublyLinkedNode lastNode) { + this.nextNode = nextNode; + this.lastNode = lastNode; + } + + private BidirectionalIterator(DoublyLinkedList list) { + this(list.first, list.last); + } + + public boolean hasNext() { + return nextNode != null; + } + + public E next() { + if (!hasNext()) + throw new NoSuchElementException(); + + E e = nextNode.element; + nextNode = nextNode.next; + return e; + } + + public boolean hasPrevious() { + if (nextNode != null) + return nextNode.previous != null; + + return lastNode != null; + } + + public E previous() { + if (!hasPrevious()) + throw new NoSuchElementException(); + + if (nextNode != null) + nextNode = nextNode.previous; + else + nextNode = lastNode; + + return nextNode.element; + } + + public BidirectionalIterator clone() { + try { + super.clone(); + } catch (CloneNotSupportedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return new BidirectionalIterator(nextNode, lastNode); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/Graph.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/Graph.java new file mode 100644 index 000000000..b5fe5e501 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/Graph.java @@ -0,0 +1,137 @@ +package org.broadinstitute.sting.playground.gatk.walkers.phasing; + +import java.util.*; + +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +public class Graph implements Iterable { + private Neighbors[] adj; + + public Graph(int numVertices) { + adj = new Neighbors[numVertices]; + for (int i = 0; i < numVertices; i++) + adj[i] = new Neighbors(); + } + + public void addEdge(GraphEdge e) { + adj[e.v1].addNeighbor(e); + adj[e.v2].addNeighbor(e); + } + + public void removeEdge(GraphEdge e) { + adj[e.v1].removeNeighbor(e); + adj[e.v2].removeNeighbor(e); + } + + public DisjointSet getConnectedComponents() { + DisjointSet cc = new DisjointSet(adj.length); + + for (int i = 0; i < adj.length; i++) + for (GraphEdge e : adj[i]) + cc.setUnion(e.v1, e.v2); + + return cc; + } + + // Note that this will give each edge TWICE [since e=(v1,v2) is stored as a neighbor for both v1 and v2] + public Iterator iterator() { + return new AllEdgesIterator(); + } + + public List getAllEdges() { + Set allEdges = new TreeSet(); // implemented GraphEdge.compareTo() + for (GraphEdge e : this) + allEdges.add(e); + + return new LinkedList(allEdges); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < adj.length; i++) { + sb.append(i + ":"); + for (GraphEdge e : adj[i]) + sb.append(" " + e); + sb.append("\n"); + } + + return sb.toString(); + } + + private class AllEdgesIterator implements Iterator { + private int curInd; + private Iterator innerIt; + + public AllEdgesIterator() { + curInd = 0; + innerIt = null; + } + + public boolean hasNext() { + for (; curInd < adj.length; curInd++) { + if (innerIt == null) + innerIt = adj[curInd].iterator(); + if (innerIt.hasNext()) + return true; + + innerIt = null; + } + + return false; + } + + public GraphEdge next() { + if (!hasNext()) + throw new NoSuchElementException(); + + return innerIt.next(); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + private class Neighbors implements Iterable { + private Set neighbors; + + public Neighbors() { + this.neighbors = new TreeSet(); // implemented GraphEdge.compareTo() + } + + public void addNeighbor(GraphEdge e) { + neighbors.add(e); + } + + public void removeNeighbor(GraphEdge e) { + neighbors.remove(e); + } + + public Iterator iterator() { + return neighbors.iterator(); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/GraphEdge.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/GraphEdge.java new file mode 100644 index 000000000..0170d476d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/GraphEdge.java @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.playground.gatk.walkers.phasing; + +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +public class GraphEdge implements Comparable { + protected int v1; + protected int v2; + + public GraphEdge(int v1, int v2) { + this.v1 = v1; + this.v2 = v2; + } + + public int getV1() { + return v1; + } + + public int getV2() { + return v2; + } + + public int compareTo(GraphEdge that) { + if (this.v1 != that.v1) + return (this.v1 - that.v1); + + // this.v1 == that.v1: + return (this.v2 - that.v2); + } + + public boolean equals(GraphEdge other) { + return (this.compareTo(other) == 0); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("(").append(v1).append(", ").append(v2).append(")"); + return sb.toString(); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 0ed5dcb0b..a6c99c057 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -42,8 +42,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.playground.gatk.walkers.phasing.*; +import javax.naming.OperationNotSupportedException; import java.io.*; import java.util.*; @@ -78,7 +78,7 @@ public class ReadBackedPhasingWalker extends RodWalker unphasedSiteQueue = null; - private LinkedList phasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions + private DoublyLinkedList partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); @@ -86,11 +86,14 @@ public class ReadBackedPhasingWalker extends RodWalker(); rodNames.add("variant"); unphasedSiteQueue = new LinkedList(); - phasedSites = new LinkedList(); + partiallyPhasedSites = new DoublyLinkedList(); initializeVcfWriter(); @@ -99,7 +102,7 @@ public class ReadBackedPhasingWalker extends RodWalker hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); @@ -171,236 +174,528 @@ public class ReadBackedPhasingWalker extends RodWalker) */ + (note that we ASSUME that the VCF is ordered by ). + Note that this will always leave at least one entry (the last one), since lastLocus is in range of itself. + */ break; } // Already saw all variant positions within cacheWindow distance ahead of vc (on its contig) } - // Update phasedSites before it's used in finalizePhasing: + // Update partiallyPhasedSites before it's used in phaseSite: oldPhasedList.addAll(discardIrrelevantPhasedSites()); - logger.debug("oldPhasedList(1) = " + toStringVCL(oldPhasedList)); + logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); - VariantAndReads phasedVr = finalizePhasing(unphasedSiteQueue.remove(), phaseStats); - logger.debug("Finalized phasing for " + VariantContextUtils.getLocation(phasedVr.variant)); - phasedSites.add(phasedVr); + VariantAndReads vr = unphasedSiteQueue.remove(); + logger.debug("Performing phasing for " + VariantContextUtils.getLocation(vr.variant)); + phaseSite(vr, phaseStats); } } - // Update phasedSites after finalizePhasing is done: + // Update partiallyPhasedSites after phaseSite is done: oldPhasedList.addAll(discardIrrelevantPhasedSites()); - logger.debug("oldPhasedList(2) = " + toStringVCL(oldPhasedList)); + logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); return oldPhasedList; } private List discardIrrelevantPhasedSites() { List vcList = new LinkedList(); - VariantContext nextToPhaseVc = null; + GenomeLoc nextToPhaseLoc = null; if (!unphasedSiteQueue.isEmpty()) - nextToPhaseVc = unphasedSiteQueue.peek().variant; + nextToPhaseLoc = VariantContextUtils.getLocation(unphasedSiteQueue.peek().variant); - while (!phasedSites.isEmpty()) { - VariantAndReads phasedVr = phasedSites.peek(); - VariantContext phasedVc = phasedVr.variant; - if (nextToPhaseVc != null && phasedVr.processVariant && isInWindowRange(phasedVc, nextToPhaseVc)) { - // nextToPhaseVc is still not far enough ahead of phasedVc to exclude phasedVc from calculations - break; + while (!partiallyPhasedSites.isEmpty()) { + if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" + UnfinishedVariantAndReads partPhasedVr = partiallyPhasedSites.peek(); + + if (partPhasedVr.processVariant && isInWindowRange(partPhasedVr.unfinishedVariant.getLocation(), nextToPhaseLoc)) + // nextToPhaseLoc is still not far enough ahead of partPhasedVr to exclude partPhasedVr from calculations + break; } - vcList.add(phasedSites.remove().variant); + vcList.add(partiallyPhasedSites.remove().unfinishedVariant.toVariantContext()); } return vcList; } /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in - phasedSites, and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig). + partiallyPhasedSites, and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig). ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). */ - - private VariantAndReads finalizePhasing(VariantAndReads vr, PhasingStats phaseStats) { - if (!vr.processVariant) - return vr; // return vr as is - - // Find the previous VariantContext (that was processed and phased): - VariantAndReads prevVr = null; - Iterator backwardsIt = phasedSites.descendingIterator(); // look at most recently phased sites - while (backwardsIt.hasNext()) { - VariantAndReads backVr = backwardsIt.next(); - if (backVr.processVariant) { - prevVr = backVr; - break; - } + private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { + UnfinishedVariantAndReads pvr = new UnfinishedVariantAndReads(vr); + if (!vr.processVariant) { + partiallyPhasedSites.add(pvr); + return; } - if (prevVr == null) - return vr; // return vr as is, since cannot phase against "nothing" (vc is at the beginning of the chromosome, or the previous was so far back it was removed from phasedSites) VariantContext vc = vr.variant; logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc)); - - LinkedList windowVaList = new LinkedList(); - - // Include previously phased sites in the phasing computation: - for (VariantAndReads phasedVr : phasedSites) { - if (phasedVr.processVariant) - windowVaList.add(phasedVr); - } - - // Add position to be phased: - windowVaList.add(vr); - - // Include as of yet unphased sites in the phasing computation: - for (VariantAndReads nextVr : unphasedSiteQueue) { - if (!isInWindowRange(vc, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc - break; - if (nextVr.processVariant) // include in the phasing computation - windowVaList.add(nextVr); - } - - if (logger.isDebugEnabled()) { - for (VariantAndReads phaseInfoVr : windowVaList) - logger.debug("Using phaseInfoVc = " + VariantContextUtils.getLocation(phaseInfoVr.variant)); - } - logger.debug(""); - - Map sampGenotypes = vc.getGenotypes(); - Map phasedGtMap = new TreeMap(); + UnfinishedVariantContext uvc = pvr.unfinishedVariant; // Perform per-sample phasing: - TreeMap samplePhaseStats = new TreeMap(); + Map sampGenotypes = vc.getGenotypes(); + Map samplePhaseStats = new TreeMap(); for (Map.Entry sampGtEntry : sampGenotypes.entrySet()) { - logger.debug("sample = " + sampGtEntry.getKey()); - boolean genotypesArePhased = false; // don't phase unless we determine the phase - String samp = sampGtEntry.getKey(); Genotype gt = sampGtEntry.getValue(); - Map gtAttribs = null; - List gtAlleles = null; - if (gt.getPloidy() != 2) { - gtAttribs = gt.getAttributes(); - gtAlleles = gt.getAlleles(); - } - else { - gtAttribs = new HashMap(gt.getAttributes()); - BialleleSNP biall = new BialleleSNP(gt); + logger.debug("sample = " + samp); + if (isCalledDiploidGenotype(gt) && gt.isHet()) { // Can attempt to phase this genotype + PhasingWindow phaseWindow = new PhasingWindow(vr, samp); + if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against + BialleleSNP biall = new BialleleSNP(gt); + logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall); - if (gt.isHet()) { - VariantContext prevVc = prevVr.variant; - Genotype prevGenotype = prevVc.getGenotype(samp); - if (prevGenotype.isHet()) { - logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + biall); + DoublyLinkedList.BidirectionalIterator prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; + /* Notes: + 1. Call to next() advances iterator to next position in partiallyPhasedSites. + 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads. + */ + UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant; + Genotype prevHetGenotype = prevUvc.getGenotype(samp); - List sampleWindowVaList = new LinkedList(); - int phasingSiteIndex = -1; - int currentIndex = 0; - for (VariantAndReads phaseInfoVr : windowVaList) { - VariantContext phaseInfoVc = phaseInfoVr.variant; - Genotype phaseInfoGt = phaseInfoVc.getGenotype(samp); - if (phaseInfoGt.isHet()) { // otherwise, of no value to phasing - sampleWindowVaList.add(phaseInfoVr); - if (phasingSiteIndex == -1) { - if (phaseInfoVr == vr) - phasingSiteIndex = currentIndex; // index of vr in sampleWindowVaList - else - currentIndex++; - } - logger.debug("STARTING TO PHASE USING POS = " + VariantContextUtils.getLocation(phaseInfoVc)); - } - } - if (logger.isDebugEnabled() && (phasingSiteIndex == -1 || phasingSiteIndex == 0)) - throw new ReviewedStingException("Internal error: could NOT find vr and/or prevVr!"); + PhaseResult pr = phaseSample(phaseWindow); + boolean genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh); + if (genotypesArePhased) { + BialleleSNP prevBiall = new BialleleSNP(prevHetGenotype); - if (sampleWindowVaList.size() > maxPhaseSites) { - logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase:\n" + toStringVRL(sampleWindowVaList)); + logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n"); + logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n"); - int prevSiteIndex = phasingSiteIndex - 1; // index of prevVr in sampleWindowVaList - int numToUse = maxPhaseSites - 2; // since always keep prevVr and vr - - int numOnLeft = prevSiteIndex; - int numOnRight = sampleWindowVaList.size() - (phasingSiteIndex + 1); - - int useOnLeft, useOnRight; - if (numOnLeft <= numOnRight) { - int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue(); // skimp on the left [floor], and be generous with the right side - useOnLeft = Math.min(halfToUse, numOnLeft); - useOnRight = Math.min(numToUse - useOnLeft, numOnRight); - } - else { // numOnRight < numOnLeft - int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] - useOnRight = Math.min(halfToUse, numOnRight); - useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); - } - int startIndex = prevSiteIndex - useOnLeft; - int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep - phasingSiteIndex -= startIndex; - sampleWindowVaList = sampleWindowVaList.subList(startIndex, stopIndex); - logger.warn("REDUCED to " + sampleWindowVaList.size() + " sites:\n" + toStringVRL(sampleWindowVaList)); - } - - PhaseResult pr = phaseSample(samp, sampleWindowVaList, phasingSiteIndex); - genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh); - if (genotypesArePhased) { - BialleleSNP prevBiall = new BialleleSNP(prevGenotype); - - logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n"); - logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n"); - - ensurePhasing(biall, prevBiall, pr.haplotype); - gtAttribs.put("PQ", pr.phaseQuality); - } - - if (statsWriter != null) - statsWriter.addStat(samp, VariantContextUtils.getLocation(vc), distance(prevVc, vc), pr.phaseQuality, pr.numReads); - - PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); - if (sampPhaseCounts == null) { - sampPhaseCounts = new PhaseCounts(); - samplePhaseStats.put(samp, sampPhaseCounts); - } - sampPhaseCounts.numTestedSites++; - if (genotypesArePhased) - sampPhaseCounts.numPhased++; + ensurePhasing(biall, prevBiall, pr.haplotype); + Map gtAttribs = new HashMap(gt.getAttributes()); + gtAttribs.put("PQ", pr.phaseQuality); + Genotype phasedGt = new Genotype(gt.getSampleName(), biall.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); + uvc.setGenotype(samp, phasedGt); } + + // Now, update the 0 or more "interior" hom sites in between the previous het site and this het site: + while (prevHetAndInteriorIt.hasNext()) { + UnfinishedVariantAndReads interiorVr = prevHetAndInteriorIt.next(); + if (interiorVr.processVariant) { + UnfinishedVariantContext interiorUvc = interiorVr.unfinishedVariant; + Genotype handledGt = interiorUvc.getGenotype(samp); + if (handledGt == null || !isCalledDiploidGenotype(handledGt)) + throw new ReviewedStingException("LOGICAL error: should not have breaks WITHIN haplotype"); + if (!handledGt.isHom()) + throw new ReviewedStingException("LOGICAL error: should not have anything besides hom sites IN BETWEEN two het sites"); + + // Use the same PQ for each hom site in the "interior" as for the het-het phase: + if (genotypesArePhased) { + Map handledGtAttribs = new HashMap(handledGt.getAttributes()); + handledGtAttribs.put("PQ", pr.phaseQuality); + Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); + interiorUvc.setGenotype(samp, phasedHomGt); + } + } + } + + if (statsWriter != null) + statsWriter.addStat(samp, VariantContextUtils.getLocation(vc), distance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); + + PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); + if (sampPhaseCounts == null) { + sampPhaseCounts = new PhaseCounts(); + samplePhaseStats.put(samp, sampPhaseCounts); + } + sampPhaseCounts.numTestedSites++; + if (genotypesArePhased) + sampPhaseCounts.numPhased++; } - gtAlleles = biall.getAllelesAsList(); } - - Genotype phasedGt = new Genotype(gt.getSampleName(), gtAlleles, gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); - phasedGtMap.put(samp, phasedGt); } - phaseStats.addIn(new PhasingStats(samplePhaseStats)); - VariantContext phasedVc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes()); - return new VariantAndReads(phasedVc, vr.sampleReadBases, vr.processVariant); + partiallyPhasedSites.add(pvr); // only add it in now, since don't want it to be there during phasing + phaseStats.addIn(new PhasingStats(samplePhaseStats)); } - private PhaseResult phaseSample(String sample, List variantList, int phasingSiteIndex) { + private static class GenotypeAndReadBases { + public Genotype genotype; + public ReadBasesAtPosition readBases; + public GenomeLoc loc; + + public GenotypeAndReadBases(Genotype genotype, ReadBasesAtPosition readBases, GenomeLoc loc) { + this.genotype = genotype; + this.readBases = readBases; + this.loc = loc; + } + } + + private class PhasingWindow { + private Genotype[] hetGenotypes = null; + private DoublyLinkedList.BidirectionalIterator prevHetAndInteriorIt = null; + private int phasingSiteIndex = -1; + private Map readsAtHetSites = null; + + public boolean hasPreviousHets() { + return phasingSiteIndex > 0; + } + + // ASSUMES that: isCalledDiploidGenotype(gt) && gt.isHet() [gt = vr.unfinishedVariant.getGenotype(sample)] + public PhasingWindow(VariantAndReads vr, String sample) { + List listHetGenotypes = new LinkedList(); + + // Include previously phased sites in the phasing computation: + DoublyLinkedList.BidirectionalIterator phasedIt = partiallyPhasedSites.iterator(); + while (phasedIt.hasNext()) { + UnfinishedVariantAndReads phasedVr = phasedIt.next(); + if (phasedVr.processVariant) { + Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample); + if (gt == null || !isCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break" + listHetGenotypes.clear(); // clear out any history + } + else if (gt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); + listHetGenotypes.add(grb); + logger.debug("Using UPSTREAM het site = " + grb.loc); + prevHetAndInteriorIt = phasedIt.clone(); + } + } + } + phasingSiteIndex = listHetGenotypes.size(); + if (phasingSiteIndex == 0) { // no previous sites against which to phase + hetGenotypes = null; + prevHetAndInteriorIt = null; + return; + } + prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] + + // Add the (het) position to be phased: + GenomeLoc phaseLocus = VariantContextUtils.getLocation(vr.variant); + GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); + listHetGenotypes.add(grbPhase); + logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); + + // Include as-of-yet unphased sites in the phasing computation: + for (VariantAndReads nextVr : unphasedSiteQueue) { + if (!isInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc + break; + if (nextVr.processVariant) { + Genotype gt = nextVr.variant.getGenotype(sample); + if (gt == null || !isCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break" + break; + } + else if (gt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), VariantContextUtils.getLocation(nextVr.variant)); + listHetGenotypes.add(grb); + logger.debug("Using DOWNSTREAM het site = " + grb.loc); + } + } + } + + // First, assemble the "sub-reads" from the COMPLETE WINDOW-BASED SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes); + + // Remove extraneous reads (those that do not "connect" the two core phasing sites): + Set onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + + // Dynamically modify the window to only include sites which have a non-empty set of reads: + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + + // In any case, must still trim the window size to be "feasible" + // [**NOTE**: May want to do this to try maximize the preservation of paths from (phasingSiteIndex - 1) to phasingSiteIndex]: + if (listHetGenotypes.size() > maxPhaseSites) { + listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); + + // Can now remove any extra reads (and then sites): + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + onlyKeepReads = removeExtraneousReads(listHetGenotypes.size()); + listHetGenotypes = removeExtraneousSites(listHetGenotypes); + } + + // Lastly, assemble the "sub-reads" from the FINAL SET of heterozygous positions for this sample: + buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); + + // Copy to a fixed-size array: + hetGenotypes = new Genotype[listHetGenotypes.size()]; + int index = 0; + for (GenotypeAndReadBases copyGrb : listHetGenotypes) + hetGenotypes[index++] = copyGrb.genotype; + } + + private void buildReadsAtHetSites(List listHetGenotypes) { + buildReadsAtHetSites(listHetGenotypes, null); + } + + private void buildReadsAtHetSites(List listHetGenotypes, Set onlyKeepReads) { + readsAtHetSites = new HashMap(); + + LinkedList basesAtPositions = new LinkedList(); + for (GenotypeAndReadBases grb : listHetGenotypes) { + ReadBasesAtPosition readBases = grb.readBases; + if (readBases == null) + readBases = new ReadBasesAtPosition(); // for transparency, put an empty list of bases at this position for sample + basesAtPositions.add(readBases); + } + + int index = 0; + for (ReadBasesAtPosition rbp : basesAtPositions) { + for (ReadBase rb : rbp) { + String readName = rb.readName; + if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads + continue; + + Read rd = readsAtHetSites.get(readName); + if (rd == null) { + rd = new Read(basesAtPositions.size(), rb.mappingQual); + readsAtHetSites.put(readName, rd); + } + rd.updateBaseAndQuality(index, rb.base, rb.baseQual); + } + index++; + } + logger.debug("Number of sites in window = " + index); + + if (logger.isDebugEnabled()) { + logger.debug("ALL READS:"); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + Read rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } + } + } + + private class ReadProperties { + public List rdEdges; + public int[] siteInds; + + public ReadProperties(Read rd) { + this.siteInds = rd.getNonNullIndices(); + this.rdEdges = new LinkedList(); + + // sufficient to create a path linking the sites in rd, so they all end up in the same connected component: + for (int i = 0; i < siteInds.length - 1; i++) { + GraphEdge e = new GraphEdge(siteInds[i], siteInds[i + 1]); + rdEdges.add(e); + } + } + } + + private class EdgeCounts { + private Map counts; + + public EdgeCounts() { + this.counts = new TreeMap(); // implemented GraphEdge.compareTo() + } + + public int getCount(GraphEdge e) { + Integer count = counts.get(e); + if (count == null) + return 0; + + return count; + } + + public int incrementEdge(GraphEdge e) { + Integer eCount = counts.get(e); + int cnt; + if (eCount == null) + cnt = 0; + else + cnt = eCount; + + cnt++; + counts.put(e, cnt); + return cnt; + } + + public int decrementEdge(GraphEdge e) { + Integer eCount = counts.get(e); + if (eCount == null) + return 0; + + int cnt = eCount - 1; + counts.put(e, cnt); + return cnt; + } + } + + public Set removeExtraneousReads(int numHetSites) { + Graph readGraph = new Graph(numHetSites); + Map readToGraphProperties = new HashMap(); + EdgeCounts edgeCounts = new EdgeCounts(); + + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + Read rd = nameToReads.getValue(); + + ReadProperties rp = new ReadProperties(rd); + if (!rp.rdEdges.isEmpty()) { // otherwise, this read is clearly irrelevant since it can't link anything + for (GraphEdge e : rp.rdEdges) { + readGraph.addEdge(e); + logger.debug("Read = " + rdName + " is adding edge: " + e); + + edgeCounts.incrementEdge(e); + } + readToGraphProperties.put(rdName, rp); + } + } + logger.debug("Read graph:\n" + readGraph); + Set keepReads = new HashSet(); + + // Check which Reads are involved in paths from (phasingSiteIndex - 1) to (phasingSiteIndex): + int prev = phasingSiteIndex - 1; + int cur = phasingSiteIndex; + + if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev + logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); + readsAtHetSites.clear(); + return keepReads; + } + + for (Map.Entry rdEdgesEntry : readToGraphProperties.entrySet()) { + String testRead = rdEdgesEntry.getKey(); + ReadProperties rp = rdEdgesEntry.getValue(); + logger.debug("Testing the connectivity of Read: " + testRead); + + // Check the connected components after removing this read's UNIQUE edges: + for (GraphEdge e : rp.rdEdges) { + if (edgeCounts.getCount(e) == 1) // otherwise, the edge still exists without this read + readGraph.removeEdge(e); + } + DisjointSet ccAfterRemove = readGraph.getConnectedComponents(); + + /* testRead contributes a path between prev and cur iff: + There exists i != j s.t. testRead[i] != null, testRead[j] != null, ccAfterRemove.inSameSet(prev,i) && ccAfterRemove.inSameSet(j,cur) + [since ALL non-null indices in testRead are connected to one another, as one clique]. + */ + List sameCCasPrev = ccAfterRemove.inSameSetAs(prev, rp.siteInds); + List sameCCasCur = ccAfterRemove.inSameSetAs(cur, rp.siteInds); + if (logger.isDebugEnabled()) { + StringBuilder sb = new StringBuilder("sameCCasPrev:"); + for (int ind : sameCCasPrev) + sb.append(" " + ind); + logger.debug(sb.toString()); + + sb = new StringBuilder("sameCCasCur:"); + for (int ind : sameCCasCur) + sb.append(" " + ind); + logger.debug(sb.toString()); + } + + boolean keepRead = false; + if (!sameCCasPrev.isEmpty() && !sameCCasCur.isEmpty()) { // There exists a path from prev to cur that goes through the sites in testRead + // Now, make sure that TWO DISTINCT sites, i and j, in testRead are used in the path: + Set union = new HashSet(sameCCasPrev); + union.addAll(sameCCasCur); + if (union.size() >= 2) // i != j + keepRead = true; + } + + if (keepRead) { + logger.debug("Read is part of path from " + prev + " to " + cur); + keepReads.add(testRead); + + // Add the removed edges back in, since we're keeping the read: + for (GraphEdge e : rp.rdEdges) + readGraph.addEdge(e); + } + else { // Decrease the count for the edges [note that any read-specific edges were already removed above]: + for (GraphEdge e : rp.rdEdges) + edgeCounts.decrementEdge(e); + } + } + + // Retain only the reads that contain an edge in a path connecting prev and cur: + Iterator> readIt = readsAtHetSites.entrySet().iterator(); + while (readIt.hasNext()) { + Map.Entry nameToReads = readIt.next(); + String rdName = nameToReads.getKey(); + if (!keepReads.contains(rdName)) { + readIt.remove(); + logger.debug("Removing extraneous read: " + rdName); + } + } + + return keepReads; + } + + private List removeExtraneousSites(List listHetGenotypes) { + Set sitesWithReads = new HashSet(); + for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { + Read rd = nameToReads.getValue(); + for (int i : rd.getNonNullIndices()) + sitesWithReads.add(i); + } + + // Remove all sites that have no read bases: + List keepHetSites = new LinkedList(); + int index = 0; + int numPrecedingRemoved = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + boolean keepSite = sitesWithReads.contains(index); + if (logger.isDebugEnabled() && !keepSite) + logger.debug("Removing read-less site " + grb.loc); + + if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) { + keepHetSites.add(grb); + if (!keepSite) + logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); + } + else if (index <= phasingSiteIndex) + numPrecedingRemoved++; + + index++; + } + + phasingSiteIndex -= numPrecedingRemoved; + return keepHetSites; + } + + private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { + logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); + + int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes + int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! + + int numOnLeft = prevSiteIndex; + int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1); + + int useOnLeft, useOnRight; + if (numOnLeft <= numOnRight) { + int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue(); // skimp on the left [floor], and be generous with the right side + useOnLeft = Math.min(halfToUse, numOnLeft); + useOnRight = Math.min(numToUse - useOnLeft, numOnRight); + } + else { // numOnRight < numOnLeft + int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] + useOnRight = Math.min(halfToUse, numOnRight); + useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); + } + int startIndex = prevSiteIndex - useOnLeft; + int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep + phasingSiteIndex -= startIndex; + listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); + logger.warn("REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); + + return listHetGenotypes; + } + } + + private PhaseResult phaseSample(PhasingWindow phaseWindow) { /* Will map a phase and its "complement" to a single representative phase, and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]: */ - HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(variantList, sample, phasingSiteIndex - 1, 2); + HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2); PhasingTable sampleHaps = tabCreator.getNewTable(); - // Assemble the "sub-reads" from the heterozygous positions for this sample: - LinkedList allPositions = new LinkedList(); - for (VariantAndReads phaseInfoVr : variantList) { - ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(sample); - if (readBases == null) - readBases = new ReadBasesAtPosition(); // for transparency, put an empty list of bases at this position for sample - allPositions.add(readBases); + logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); + if (logger.isDebugEnabled()) { + logger.debug("USED READS:"); + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { + String rdName = nameToReads.getKey(); + Read rd = nameToReads.getValue(); + logger.debug(rd + "\t" + rdName); + } } - HashMap allReads = convertReadBasesAtPositionToReads(allPositions); - logger.debug("Number of TOTAL reads [including those covering only 1 position] at sites: " + allReads.size()); - int numUsedReads = 0; // Update the phasing table based on each of the sub-reads for this sample: - for (Map.Entry nameToReads : allReads.entrySet()) { + for (Map.Entry nameToReads : phaseWindow.readsAtHetSites.entrySet()) { Read rd = nameToReads.getValue(); - if (rd.numNonNulls() <= 1) // can't possibly provide any phasing information, so save time - continue; - numUsedReads++; logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : "")); for (PhasingTable.PhasingTableEntry pte : sampleHaps) { @@ -411,7 +706,6 @@ public class ReadBackedPhasingWalker extends RodWalker 1 position in the haplotype] = " + numUsedReads); // Marginalize each haplotype to its first 2 positions: sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps); @@ -434,7 +728,7 @@ public class ReadBackedPhasingWalker extends RodWalker sampPhaseCountEntry : result.getPhaseCounts()) { PhaseCounts pc = sampPhaseCountEntry.getValue(); System.out.println("Sample: " + sampPhaseCountEntry.getKey() + "\tNumber of tested sites: " + pc.numTestedSites + "\tNumber of phased sites: " + pc.numPhased); @@ -524,26 +830,6 @@ public class ReadBackedPhasingWalker extends RodWalker convertReadBasesAtPositionToReads(Collection basesAtPositions) { - HashMap reads = new HashMap(); - - int index = 0; - for (ReadBasesAtPosition rbp : basesAtPositions) { - for (ReadBase rb : rbp) { - String readName = rb.readName; - - Read rd = reads.get(readName); - if (rd == null) { - rd = new Read(basesAtPositions.size(), rb.mappingQual); - reads.put(readName, rd); - } - rd.updateBaseAndQuality(index, rb.base, rb.baseQual); - } - index++; - } - return reads; - } - /* Inner classes: @@ -587,16 +873,16 @@ public class ReadBackedPhasingWalker extends RodWalker vrList) { + private static String toStringGRL(List grbList) { boolean first = true; StringBuilder sb = new StringBuilder(); - for (VariantAndReads vr : vrList) { + for (GenotypeAndReadBases grb : grbList) { if (first) first = false; else sb.append(" -- "); - sb.append(VariantContextUtils.getLocation(vr.variant)); + sb.append(grb.loc); } return sb.toString(); } @@ -615,6 +901,65 @@ public class ReadBackedPhasingWalker extends RodWalker sampleReadBases; + public boolean processVariant; + + public UnfinishedVariantAndReads(UnfinishedVariantContext unfinishedVariant, HashMap sampleReadBases, boolean processVariant) { + this.unfinishedVariant = unfinishedVariant; + this.sampleReadBases = sampleReadBases; + this.processVariant = processVariant; + } + + public UnfinishedVariantAndReads(VariantAndReads vr) { + this.unfinishedVariant = new UnfinishedVariantContext(vr.variant); + this.sampleReadBases = vr.sampleReadBases; + this.processVariant = vr.processVariant; + } + } + + // COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]... + private static class UnfinishedVariantContext { + private String name; + private String contig; + private long start; + private long stop; + private Collection alleles; + private Map genotypes; + private double negLog10PError; + private Set filters; + private Map attributes; + + public UnfinishedVariantContext(VariantContext vc) { + this.name = vc.getName(); + this.contig = vc.getChr(); + this.start = vc.getStart(); + this.stop = vc.getEnd(); + this.alleles = vc.getAlleles(); + this.genotypes = new HashMap(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable + this.negLog10PError = vc.getNegLog10PError(); + this.filters = vc.getFilters(); + this.attributes = vc.getAttributes(); + } + + public VariantContext toVariantContext() { + return new VariantContext(name, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes); + } + + public GenomeLoc getLocation() { + return GenomeLocParser.createGenomeLoc(contig, start, stop); + } + + public Genotype getGenotype(String sample) { + return genotypes.get(sample); + } + + public void setGenotype(String sample, Genotype newGt) { + genotypes.put(sample, newGt); + } + } + private static class ReadBase { public String readName; public byte base; @@ -645,6 +990,10 @@ public class ReadBackedPhasingWalker extends RodWalker iterator() { return bases.iterator(); } + + public boolean isEmpty() { + return bases.isEmpty(); + } } // @@ -656,14 +1005,8 @@ public class ReadBackedPhasingWalker extends RodWalker vaList, String sample) { - this.genotypes = new Genotype[vaList.size()]; - int index = 0; - for (VariantAndReads phaseInfoVr : vaList) { - VariantContext phaseInfoVc = phaseInfoVr.variant; - Genotype phaseInfoGt = phaseInfoVc.getGenotype(sample); - genotypes[index++] = phaseInfoGt; - } + public HaplotypeTableCreator(Genotype[] hetGenotypes) { + this.genotypes = hetGenotypes; } abstract public PhasingTable getNewTable(); @@ -671,9 +1014,8 @@ public class ReadBackedPhasingWalker extends RodWalker getAllHaplotypes() { int numSites = genotypes.length; int[] genotypeCards = new int[numSites]; - for (int i = 0; i < numSites; i++) { + for (int i = 0; i < numSites; i++) genotypeCards[i] = genotypes[i].getPloidy(); - } LinkedList allHaps = new LinkedList(); CardinalityCounter alleleCounter = new CardinalityCounter(genotypeCards); @@ -689,7 +1031,7 @@ public class ReadBackedPhasingWalker extends RodWalker hapMap = new TreeMap(); + Map hapMap = new TreeMap(); for (PhasingTable.PhasingTableEntry pte : table) { Haplotype rep = pte.getHaplotypeClass().getRepresentative(); PreciseNonNegativeDouble score = hapMap.get(rep); @@ -718,8 +1060,8 @@ public class ReadBackedPhasingWalker extends RodWalker vaList, String sample, int startIndex, int marginalizeLength) { - super(vaList, sample); + public BiallelicComplementHaplotypeTableCreator(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { + super(hetGenotypes); this.bialleleSNPs = new BialleleSNP[genotypes.length]; for (int i = 0; i < genotypes.length; i++) @@ -853,18 +1195,20 @@ public class ReadBackedPhasingWalker extends RodWalker nonNull = new LinkedList(); + for (int i = 0; i < bases.length; i++) { + if (getBase(i) != null) + nonNull.add(i); + } + + int[] nonNullArray = new int[nonNull.size()]; + int index = 0; + for (int i : nonNull) + nonNullArray[index++] = i; + return nonNullArray; + } } class PhasingStats { @@ -1111,7 +1460,7 @@ class PhasingStats { private int numVarSites; // Map of: sample -> PhaseCounts: - private TreeMap samplePhaseStats; + private Map samplePhaseStats; public PhasingStats() { this(new TreeMap()); @@ -1123,7 +1472,7 @@ class PhasingStats { this.samplePhaseStats = new TreeMap(); } - public PhasingStats(TreeMap samplePhaseStats) { + public PhasingStats(Map samplePhaseStats) { this.numReads = 0; this.numVarSites = 0; this.samplePhaseStats = samplePhaseStats; @@ -1191,10 +1540,10 @@ class PhasingQualityStatsWriter { this.variantStatsFilePrefix = variantStatsFilePrefix; } - public void addStat(String sample, GenomeLoc locus, int distanceFromPrevious, double phasingQuality, int numReads) { + public void addStat(String sample, GenomeLoc locus, int distanceFromPrevious, double phasingQuality, int numReads, int windowSize) { BufferedWriter sampWriter = sampleToStatsWriter.get(sample); if (sampWriter == null) { - String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ_numReads.txt"; + String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ_numReads_windowSize.txt"; FileOutputStream output; try { @@ -1206,7 +1555,7 @@ class PhasingQualityStatsWriter { sampleToStatsWriter.put(sample, sampWriter); } try { - sampWriter.write(locus + "\t" + distanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\n"); + sampWriter.write(locus + "\t" + distanceFromPrevious + "\t" + phasingQuality + "\t" + numReads + "\t" + windowSize + "\n"); sampWriter.flush(); } catch (IOException e) { throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); @@ -1224,4 +1573,4 @@ class PhasingQualityStatsWriter { } } } -} +} \ No newline at end of file