Added validator for phasing using read information, e.g., PacBio: ReadBasedPhasingValidationWalker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4940 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2011-01-05 20:05:56 +00:00
parent d203f5e39a
commit 4b37710bcd
8 changed files with 848 additions and 280 deletions

View File

@ -0,0 +1,110 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
public abstract class BaseArray implements Comparable<BaseArray> {
protected Byte[] bases;
public BaseArray(byte[] bases) {
this.bases = new Byte[bases.length];
for (int i = 0; i < bases.length; i++)
this.bases[i] = bases[i];
}
public BaseArray(Byte[] bases) {
this.bases = Arrays.copyOf(bases, bases.length);
}
public BaseArray(int length) {
this.bases = new Byte[length];
Arrays.fill(this.bases, null);
}
public BaseArray(BaseArray other) {
this(other.bases);
}
public void updateBase(int index, Byte base) {
bases[index] = base;
}
public Byte getBase(int index) {
return bases[index];
}
public int size() {
return bases.length;
}
public String toString() {
StringBuilder sb = new StringBuilder(bases.length);
for (Byte b : bases)
sb.append(b != null ? (char) b.byteValue() : "_");
return sb.toString();
}
public int compareTo(BaseArray that) {
int sz = this.bases.length;
if (sz != that.bases.length)
return (sz - that.bases.length);
for (int i = 0; i < sz; i++) {
Byte thisBase = this.getBase(i);
Byte thatBase = that.getBase(i);
if (thisBase == null || thatBase == null) {
if (thisBase == null && thatBase != null) {
return -1;
}
else if (thisBase != null && thatBase == null) {
return 1;
}
}
else if (!BaseUtils.extendedBasesAreEqual(thisBase, thatBase)) {
return thisBase - thatBase;
}
}
return 0;
}
public int[] getNonNullIndices() {
List<Integer> nonNull = new LinkedList<Integer>();
for (int i = 0; i < bases.length; i++) {
if (getBase(i) != null)
nonNull.add(i);
}
int[] nonNullArray = new int[nonNull.size()];
int index = 0;
for (int i : nonNull)
nonNullArray[index++] = i;
return nonNullArray;
}
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
public class Haplotype extends BaseArray implements Cloneable {
public Haplotype(byte[] bases) {
super(bases);
}
private Haplotype(Byte[] bases) {
super(bases);
}
public Haplotype(Haplotype other) {
super(other);
}
public Haplotype(BaseArray baseArr) {
super(baseArr.bases);
if (baseArr.getNonNullIndices().length != baseArr.bases.length)
throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!");
}
public void updateBase(int index, Byte base) {
if (base == null) {
throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!");
}
super.updateBase(index, base);
}
public Haplotype clone() {
try {
super.clone();
} catch (CloneNotSupportedException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
return new Haplotype(this);
}
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
}
}

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.PreciseNonNegativeDouble;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
public class PhasingRead extends BaseArray {
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
public PhasingRead(int length, int mappingQual) {
super(length);
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);
this.baseErrorProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseErrorProbs, null);
}
public void updateBaseAndQuality(int index, Byte base, byte baseQual) {
updateBase(index, base);
double errProb = QualityUtils.qualToErrorProb(baseQual);
// The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]:
errProb = Math.max(errProb, 1.0 - mappingProb.getValue());
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases
}
public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) {
PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0);
for (Haplotype h : hapClass)
value.plusEqual(matchHaplotypeScore(h));
return new PhasingScore(value);
}
private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) {
PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0);
int sz = this.bases.length;
if (sz != hap.bases.length)
throw new ReviewedStingException("Read and Haplotype should have same length to be compared!");
// Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness:
score.timesEqual(mappingProb);
for (int i = 0; i < sz; i++) {
Byte thisBase = this.getBase(i);
Byte hapBase = hap.getBase(i);
if (thisBase != null && hapBase != null) {
if (BaseUtils.basesAreEqual(thisBase, hapBase))
score.timesEqual(baseProbs[i]);
else
score.timesEqual(baseErrorProbs[i]);
}
}
return score;
}
}

View File

@ -21,7 +21,6 @@
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.gatk.walkers.phasing; package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Allele;
@ -202,7 +201,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
return null; return null;
mostDownstreamLocusReached = ref.getLocus(); mostDownstreamLocusReached = ref.getLocus();
if ( DEBUG ) logger.debug("map() at: " + mostDownstreamLocusReached); if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached);
PhasingStats phaseStats = new PhasingStats(); PhasingStats phaseStats = new PhasingStats();
List<VariantContext> unprocessedList = new LinkedList<VariantContext>(); List<VariantContext> unprocessedList = new LinkedList<VariantContext>();
@ -210,13 +209,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
boolean requireStartHere = true; // only see each VariantContext once boolean requireStartHere = true; // only see each VariantContext once
boolean takeFirstOnly = false; // take as many entries as the VCF file has boolean takeFirstOnly = false; // take as many entries as the VCF file has
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
if ( samplesToPhase != null ) vc = reduceVCToSamples(vc, samplesToPhase); if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase);
if (ReadBackedPhasingWalker.processVariantInPhasing(vc)) { if (ReadBackedPhasingWalker.processVariantInPhasing(vc)) {
VariantAndReads vr = new VariantAndReads(vc, context); VariantAndReads vr = new VariantAndReads(vc, context);
unphasedSiteQueue.add(vr); unphasedSiteQueue.add(vr);
if ( DEBUG ) logger.debug("Added variant to queue = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant)); if (DEBUG)
logger.debug("Added variant to queue = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
} }
else { else {
unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly
@ -240,13 +240,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList("PQ")); private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList("PQ"));
private VariantContext reduceVCToSamples(VariantContext vc, List<String> samplesToPhase) { private VariantContext reduceVCToSamples(VariantContext vc, List<String> samplesToPhase) {
// for ( String sample : samplesToPhase ) // for ( String sample : samplesToPhase )
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values()); VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values());
// logger.debug("original VC = " + vc); // logger.debug("original VC = " + vc);
// logger.debug("sub VC = " + subvc); // logger.debug("sub VC = " + subvc);
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF ); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
} }
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) { private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
@ -255,7 +256,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
while (!unphasedSiteQueue.isEmpty()) { while (!unphasedSiteQueue.isEmpty()) {
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
if (startDistancesAreInWindowRange(mostDownstreamLocusReached, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),nextToPhaseVc))) { if (startDistancesAreInWindowRange(mostDownstreamLocusReached, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) {
/* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc /* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc
(note that we ASSUME that the VCF is ordered by <contig,locus>). (note that we ASSUME that the VCF is ordered by <contig,locus>).
Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself. Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself.
@ -266,16 +267,17 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
// Update partiallyPhasedSites before it's used in phaseSite: // Update partiallyPhasedSites before it's used in phaseSite:
oldPhasedList.addAll(discardIrrelevantPhasedSites()); oldPhasedList.addAll(discardIrrelevantPhasedSites());
if ( DEBUG ) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList)); if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList));
VariantAndReads vr = unphasedSiteQueue.remove(); VariantAndReads vr = unphasedSiteQueue.remove();
if ( DEBUG ) logger.debug("Performing phasing for " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant)); if (DEBUG)
logger.debug("Performing phasing for " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
phaseSite(vr, phaseStats); phaseSite(vr, phaseStats);
} }
// Update partiallyPhasedSites after phaseSite is done: // Update partiallyPhasedSites after phaseSite is done:
oldPhasedList.addAll(discardIrrelevantPhasedSites()); oldPhasedList.addAll(discardIrrelevantPhasedSites());
if ( DEBUG ) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList)); if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList));
if (outputMultipleBaseCountsWriter != null) if (outputMultipleBaseCountsWriter != null)
outputMultipleBaseCountsWriter.outputMultipleBaseCounts(); outputMultipleBaseCountsWriter.outputMultipleBaseCounts();
@ -288,7 +290,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
GenomeLoc nextToPhaseLoc = null; GenomeLoc nextToPhaseLoc = null;
if (!unphasedSiteQueue.isEmpty()) if (!unphasedSiteQueue.isEmpty())
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),unphasedSiteQueue.peek().variant); nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant);
while (!partiallyPhasedSites.isEmpty()) { while (!partiallyPhasedSites.isEmpty()) {
if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past" if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past"
@ -313,7 +315,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) { private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) {
VariantContext vc = vr.variant; VariantContext vc = vr.variant;
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc)); logger.debug("Will phase vc = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr); UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr);
UnfinishedVariantContext uvc = uvr.unfinishedVariant; UnfinishedVariantContext uvc = uvr.unfinishedVariant;
@ -325,7 +327,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
String samp = sampGtEntry.getKey(); String samp = sampGtEntry.getKey();
Genotype gt = sampGtEntry.getValue(); Genotype gt = sampGtEntry.getValue();
if ( DEBUG ) logger.debug("sample = " + samp); if (DEBUG) logger.debug("sample = " + samp);
if (isUnfilteredCalledDiploidGenotype(gt)) { if (isUnfilteredCalledDiploidGenotype(gt)) {
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
// true <-> can trivially phase a hom site relative to ANY previous site: // true <-> can trivially phase a hom site relative to ANY previous site:
@ -336,7 +338,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
PhasingWindow phaseWindow = new PhasingWindow(vr, samp); PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
SNPallelePair allelePair = new SNPallelePair(gt); SNPallelePair allelePair = new SNPallelePair(gt);
if ( DEBUG ) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair); if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
/* Notes: /* Notes:
@ -350,15 +352,17 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality); boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
if (pr.phasingContainsInconsistencies) { if (pr.phasingContainsInconsistencies) {
if ( DEBUG ) logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc)); if (DEBUG)
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
uvc.setPhasingInconsistent(); uvc.setPhasingInconsistent();
} }
if (genotypesArePhased) { if (genotypesArePhased) {
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype); SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
if ( DEBUG ) logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n"); if (DEBUG)
if ( DEBUG ) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n"); logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n");
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
ensurePhasing(allelePair, prevAllelePair, pr.haplotype); ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes()); Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
@ -389,7 +393,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
if (statsWriter != null) if (statsWriter != null)
statsWriter.addStat(samp, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length); statsWriter.addStat(samp, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp); PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) { if (sampPhaseCounts == null) {
@ -436,7 +440,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private Genotype[] hetGenotypes = null; private Genotype[] hetGenotypes = null;
private DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = null; private DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = null;
private int phasingSiteIndex = -1; private int phasingSiteIndex = -1;
private Map<String, Read> readsAtHetSites = null; private Map<String, PhasingRead> readsAtHetSites = null;
public boolean hasPreviousHets() { public boolean hasPreviousHets() {
return phasingSiteIndex > 0; return phasingSiteIndex > 0;
@ -458,7 +462,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
else if (gt.isHet()) { else if (gt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation());
listHetGenotypes.add(grb); listHetGenotypes.add(grb);
if ( DEBUG ) logger.debug("Using UPSTREAM het site = " + grb.loc); if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
prevHetAndInteriorIt = phasedIt.clone(); prevHetAndInteriorIt = phasedIt.clone();
} }
} }
@ -471,10 +475,11 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()]
// Add the (het) position to be phased: // Add the (het) position to be phased:
GenomeLoc phaseLocus = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant); GenomeLoc phaseLocus = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant);
GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus);
listHetGenotypes.add(grbPhase); listHetGenotypes.add(grbPhase);
if ( DEBUG ) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); if (DEBUG)
logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
// Include as-of-yet unphased sites in the phasing computation: // Include as-of-yet unphased sites in the phasing computation:
for (VariantAndReads nextVr : unphasedSiteQueue) { for (VariantAndReads nextVr : unphasedSiteQueue) {
@ -485,9 +490,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
break; break;
} }
else if (gt.isHet()) { else if (gt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),nextVr.variant)); GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant));
listHetGenotypes.add(grb); listHetGenotypes.add(grb);
if ( DEBUG ) logger.debug("Using DOWNSTREAM het site = " + grb.loc); if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
} }
} }
@ -515,7 +520,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
buildReadsAtHetSites(listHetGenotypes, onlyKeepReads); buildReadsAtHetSites(listHetGenotypes, onlyKeepReads);
// Copy to a fixed-size array: // Copy to a fixed-size array:
if ( DEBUG ) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); if (DEBUG)
logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
hetGenotypes = new Genotype[listHetGenotypes.size()]; hetGenotypes = new Genotype[listHetGenotypes.size()];
int index = 0; int index = 0;
for (GenotypeAndReadBases copyGrb : listHetGenotypes) for (GenotypeAndReadBases copyGrb : listHetGenotypes)
@ -531,7 +537,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private void buildReadsAtHetSites(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phasingLoc, Set<String> onlyKeepReads) { private void buildReadsAtHetSites(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phasingLoc, Set<String> onlyKeepReads) {
readsAtHetSites = new HashMap<String, Read>(); readsAtHetSites = new HashMap<String, PhasingRead>();
int index = 0; int index = 0;
for (GenotypeAndReadBases grb : listHetGenotypes) { for (GenotypeAndReadBases grb : listHetGenotypes) {
@ -542,9 +548,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads
continue; continue;
Read rd = readsAtHetSites.get(readName); PhasingRead rd = readsAtHetSites.get(readName);
if (rd == null) { if (rd == null) {
rd = new Read(listHetGenotypes.size(), rb.mappingQual); rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual);
readsAtHetSites.put(readName, rd); readsAtHetSites.put(readName, rd);
} }
else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index
@ -558,13 +564,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
index++; index++;
} }
if ( DEBUG ) logger.debug("Number of sites in window = " + index); if (DEBUG) logger.debug("Number of sites in window = " + index);
if ( DEBUG && logger.isDebugEnabled()) { if (DEBUG && logger.isDebugEnabled()) {
logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:"); logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:");
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) { for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue(); PhasingRead rd = nameToReads.getValue();
logger.debug(rd + "\t" + rdName); logger.debug(rd + "\t" + rdName);
} }
} }
@ -620,16 +626,16 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
EdgeToReads edgeToReads = new EdgeToReads(); EdgeToReads edgeToReads = new EdgeToReads();
Set<Integer> sitesWithEdges = new TreeSet<Integer>(); Set<Integer> sitesWithEdges = new TreeSet<Integer>();
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) { for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue(); PhasingRead rd = nameToReads.getValue();
int[] siteInds = rd.getNonNullIndices(); int[] siteInds = rd.getNonNullIndices();
// Connect each pair of non-null sites in rd: // Connect each pair of non-null sites in rd:
for (int i = 0; i < siteInds.length; i++) { for (int i = 0; i < siteInds.length; i++) {
for (int j = i + 1; j < siteInds.length; j++) { for (int j = i + 1; j < siteInds.length; j++) {
GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]); GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]);
if ( DEBUG ) logger.debug("Read = " + rdName + " is adding edge: " + e); if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e);
readGraph.addEdge(e); readGraph.addEdge(e);
edgeToReads.addRead(e, rdName); edgeToReads.addRead(e, rdName);
@ -639,7 +645,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
} }
} }
if ( DEBUG ) logger.debug("Read graph:\n" + readGraph); if (DEBUG) logger.debug("Read graph:\n" + readGraph);
Set<String> keepReads = new HashSet<String>(); Set<String> keepReads = new HashSet<String>();
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex): /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
@ -692,7 +698,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
int cur = phasingSiteIndex; int cur = phasingSiteIndex;
if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev
if ( DEBUG ) logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]"); if (DEBUG)
logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]");
readsAtHetSites.clear(); readsAtHetSites.clear();
return keepReads; return keepReads;
} }
@ -703,7 +710,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites]; IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites];
IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites]; IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites];
for (int i : sitesWithEdges) { for (int i : sitesWithEdges) {
if ( DEBUG ) logger.debug("Calculating CC after removing edges of site: " + i); if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i);
// Remove all edges incident to i and see which positions have paths to prev and cur: // Remove all edges incident to i and see which positions have paths to prev and cur:
Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i); Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i);
@ -713,15 +720,15 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges)); removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges));
removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges)); removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges));
if ( DEBUG ) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]); if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
if ( DEBUG ) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]); if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
// Add the removed edges back in: // Add the removed edges back in:
readGraph.addEdges(removedEdges); readGraph.addEdges(removedEdges);
} }
for (GraphEdge e : readGraph) { for (GraphEdge e : readGraph) {
if ( DEBUG ) logger.debug("Testing the path-connectivity of Edge: " + e); if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e);
/* Edge e={v1,v2} contributes a path between prev and cur for testRead iff: /* Edge e={v1,v2} contributes a path between prev and cur for testRead iff:
testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa]. testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa].
@ -746,13 +753,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
// Retain only the reads that contain an edge in a path connecting prev and cur: // Retain only the reads that contain an edge in a path connecting prev and cur:
Iterator<Map.Entry<String, Read>> readIt = readsAtHetSites.entrySet().iterator(); Iterator<Map.Entry<String, PhasingRead>> readIt = readsAtHetSites.entrySet().iterator();
while (readIt.hasNext()) { while (readIt.hasNext()) {
Map.Entry<String, Read> nameToReads = readIt.next(); Map.Entry<String, PhasingRead> nameToReads = readIt.next();
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
if (!keepReads.contains(rdName)) { if (!keepReads.contains(rdName)) {
readIt.remove(); readIt.remove();
if ( DEBUG ) logger.debug("Removing extraneous read: " + rdName); if (DEBUG) logger.debug("Removing extraneous read: " + rdName);
} }
} }
@ -761,8 +768,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private List<GenotypeAndReadBases> removeExtraneousSites(List<GenotypeAndReadBases> listHetGenotypes) { private List<GenotypeAndReadBases> removeExtraneousSites(List<GenotypeAndReadBases> listHetGenotypes) {
Set<Integer> sitesWithReads = new HashSet<Integer>(); Set<Integer> sitesWithReads = new HashSet<Integer>();
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) { for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
Read rd = nameToReads.getValue(); PhasingRead rd = nameToReads.getValue();
for (int i : rd.getNonNullIndices()) for (int i : rd.getNonNullIndices())
sitesWithReads.add(i); sitesWithReads.add(i);
} }
@ -779,7 +786,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) { if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) {
keepHetSites.add(grb); keepHetSites.add(grb);
if (!keepSite) if (!keepSite)
if ( DEBUG ) logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); if (DEBUG)
logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
} }
else if (index <= phasingSiteIndex) else if (index <= phasingSiteIndex)
numPrecedingRemoved++; numPrecedingRemoved++;
@ -792,7 +800,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private List<GenotypeAndReadBases> trimWindow(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phaseLocus) { private List<GenotypeAndReadBases> trimWindow(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phaseLocus) {
if ( DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); if (DEBUG)
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes
int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites!
@ -815,7 +824,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
phasingSiteIndex -= startIndex; phasingSiteIndex -= startIndex;
listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex);
if ( DEBUG ) logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); if (DEBUG)
logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
return listHetGenotypes; return listHetGenotypes;
} }
@ -831,9 +841,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (DEBUG && logger.isDebugEnabled()) { if (DEBUG && logger.isDebugEnabled()) {
logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size()); logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size());
logger.debug("USED READS:"); logger.debug("USED READS:");
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) { for (Map.Entry<String, PhasingRead> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
String rdName = nameToReads.getKey(); String rdName = nameToReads.getKey();
Read rd = nameToReads.getValue(); PhasingRead rd = nameToReads.getValue();
logger.debug(rd + "\t" + rdName); logger.debug(rd + "\t" + rdName);
} }
} }
@ -847,19 +857,20 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
double totalAbsPQchange = 0; double totalAbsPQchange = 0;
int numPQchangesObserved = 0; int numPQchangesObserved = 0;
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) { for (Map.Entry<String, PhasingRead> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
Read rd = nameToReads.getValue(); PhasingRead rd = nameToReads.getValue();
if ( DEBUG ) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey()); if (DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey());
for (PhasingTable.PhasingTableEntry pte : sampleHaps) { for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
pte.getScore().integrateReadScore(score); pte.getScore().integrateReadScore(score);
if ( DEBUG ) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
} }
// Check the current best haplotype assignment and compare it to the previous one: // Check the current best haplotype assignment and compare it to the previous one:
MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false);
if ( DEBUG ) logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); if (DEBUG)
logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
if (prevMaxHapAndQual != null) { if (prevMaxHapAndQual != null) {
double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality;
@ -867,7 +878,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
numHighQualityIterations++; numHighQualityIterations++;
if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase
(numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ
if ( DEBUG ) logger.debug("Inconsistent read found!"); if (DEBUG) logger.debug("Inconsistent read found!");
numInconsistentIterations++; numInconsistentIterations++;
} }
} }
@ -878,12 +889,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
prevMaxHapAndQual = curMaxHapAndQual; prevMaxHapAndQual = curMaxHapAndQual;
} }
if ( DEBUG ) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true); MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
if ( DEBUG ) logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality); if (DEBUG)
if ( DEBUG ) logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations); logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
if (DEBUG)
logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
boolean phasingContainsInconsistencies = false; boolean phasingContainsInconsistencies = false;
if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS) if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS)
@ -951,7 +964,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) {
return startDistancesAreInWindowRange(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc1), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc2)); return startDistancesAreInWindowRange(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
} }
private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) { private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) {
@ -959,7 +972,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) { private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) {
return uvc1.getLocation().distance(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc2)); return uvc1.getLocation().distance(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
} }
public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) { public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) {
@ -1006,13 +1019,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
private void writeVCF(VariantContext vc) { private void writeVCF(VariantContext vc) {
if ( samplesToPhase == null || vc.isNotFiltered() ) if (samplesToPhase == null || vc.isNotFiltered())
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant //if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
WriteVCF.writeVCF(vc, writer, logger); WriteVCF.writeVCF(vc, writer, logger);
} }
public static boolean processVariantInPhasing(VariantContext vc) { public static boolean processVariantInPhasing(VariantContext vc) {
return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || ! vc.isVariant()); // we can handle the non-variant case as well return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well
//return isUnfilteredBiallelicSNP(vc); //return isUnfilteredBiallelicSNP(vc);
} }
@ -1140,47 +1153,11 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
else else
sb.append(" -- "); sb.append(" -- ");
sb.append(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc)); sb.append(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
} }
return sb.toString(); return sb.toString();
} }
private static class ReadBase {
public String readName;
public byte base;
public int mappingQual;
public byte baseQual;
public ReadBase(String readName, byte base, int mappingQual, byte baseQual) {
this.readName = readName;
this.base = base;
this.mappingQual = mappingQual;
this.baseQual = baseQual;
}
}
private static class ReadBasesAtPosition implements Iterable<ReadBase> {
// list of: <read name, base>
private LinkedList<ReadBase> bases;
public ReadBasesAtPosition() {
this.bases = new LinkedList<ReadBase>();
}
public void putReadBase(PileupElement pue) {
ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual());
bases.add(rb);
}
public Iterator<ReadBase> iterator() {
return bases.iterator();
}
public boolean isEmpty() {
return bases.isEmpty();
}
}
// //
// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases() // THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases()
// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH // FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH
@ -1432,7 +1409,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
public void outputMultipleBaseCounts() { public void outputMultipleBaseCounts() {
GenomeLoc nextToPhaseLoc = null; GenomeLoc nextToPhaseLoc = null;
if (!unphasedSiteQueue.isEmpty()) if (!unphasedSiteQueue.isEmpty())
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),unphasedSiteQueue.peek().variant); nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant);
outputMultipleBaseCounts(nextToPhaseLoc); outputMultipleBaseCounts(nextToPhaseLoc);
} }
@ -1484,108 +1461,6 @@ class PhasingScore extends PreciseNonNegativeDouble {
} }
} }
abstract class BaseArray implements Comparable<BaseArray> {
protected Byte[] bases;
public BaseArray(byte[] bases) {
this.bases = new Byte[bases.length];
for (int i = 0; i < bases.length; i++)
this.bases[i] = bases[i];
}
public BaseArray(Byte[] bases) {
this.bases = Arrays.copyOf(bases, bases.length);
}
public BaseArray(int length) {
this.bases = new Byte[length];
Arrays.fill(this.bases, null);
}
public BaseArray(BaseArray other) {
this(other.bases);
}
public void updateBase(int index, Byte base) {
bases[index] = base;
}
public Byte getBase(int index) {
return bases[index];
}
public int size() {
return bases.length;
}
public String toString() {
StringBuilder sb = new StringBuilder(bases.length);
for (Byte b : bases)
sb.append(b != null ? (char) b.byteValue() : "_");
return sb.toString();
}
public int compareTo(BaseArray that) {
int sz = this.bases.length;
if (sz != that.bases.length)
return (sz - that.bases.length);
for (int i = 0; i < sz; i++) {
Byte thisBase = this.getBase(i);
Byte thatBase = that.getBase(i);
if (thisBase == null || thatBase == null) {
if (thisBase == null && thatBase != null) {
return -1;
}
else if (thisBase != null && thatBase == null) {
return 1;
}
}
else if (!BaseUtils.basesAreEqual(thisBase, thatBase)) {
return thisBase - thatBase;
}
}
return 0;
}
}
class Haplotype extends BaseArray implements Cloneable {
public Haplotype(byte[] bases) {
super(bases);
}
private Haplotype(Byte[] bases) {
super(bases);
}
public Haplotype(Haplotype other) {
super(other);
}
public void updateBase(int index, Byte base) {
if (base == null) {
throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!");
}
super.updateBase(index, base);
}
public Haplotype clone() {
try {
super.clone();
} catch (CloneNotSupportedException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
return new Haplotype(this);
}
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
}
}
class HaplotypeClass implements Iterable<Haplotype> { class HaplotypeClass implements Iterable<Haplotype> {
private ArrayList<Haplotype> haps; private ArrayList<Haplotype> haps;
private Haplotype rep; private Haplotype rep;
@ -1619,82 +1494,6 @@ class HaplotypeClass implements Iterable<Haplotype> {
} }
} }
class Read extends BaseArray {
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
public Read(int length, int mappingQual) {
super(length);
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);
this.baseErrorProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseErrorProbs, null);
}
public void updateBaseAndQuality(int index, Byte base, byte baseQual) {
updateBase(index, base);
double errProb = QualityUtils.qualToErrorProb(baseQual);
// The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]:
errProb = Math.max(errProb, 1.0 - mappingProb.getValue());
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases
}
public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) {
PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0);
for (Haplotype h : hapClass)
value.plusEqual(matchHaplotypeScore(h));
return new PhasingScore(value);
}
private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) {
PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0);
int sz = this.bases.length;
if (sz != hap.bases.length)
throw new ReviewedStingException("Read and Haplotype should have same length to be compared!");
// Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness:
score.timesEqual(mappingProb);
for (int i = 0; i < sz; i++) {
Byte thisBase = this.getBase(i);
Byte hapBase = hap.getBase(i);
if (thisBase != null && hapBase != null) {
if (BaseUtils.basesAreEqual(thisBase, hapBase))
score.timesEqual(baseProbs[i]);
else
score.timesEqual(baseErrorProbs[i]);
}
}
return score;
}
public int[] getNonNullIndices() {
List<Integer> nonNull = new LinkedList<Integer>();
for (int i = 0; i < bases.length; i++) {
if (getBase(i) != null)
nonNull.add(i);
}
int[] nonNullArray = new int[nonNull.size()];
int index = 0;
for (int i : nonNull)
nonNullArray[index++] = i;
return nonNullArray;
}
}
class PhasingStats { class PhasingStats {
private int numReads; private int numReads;
private int numVarSites; private int numVarSites;

View File

@ -0,0 +1,38 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
public class ReadBase {
public String readName;
public byte base;
public int mappingQual;
public byte baseQual;
public ReadBase(String readName, byte base, int mappingQual, byte baseQual) {
this.readName = readName;
this.base = base;
this.mappingQual = mappingQual;
this.baseQual = baseQual;
}
}

View File

@ -0,0 +1,51 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.Iterator;
import java.util.LinkedList;
public class ReadBasesAtPosition implements Iterable<ReadBase> {
// list of: <read name, base>
private LinkedList<ReadBase> bases;
public ReadBasesAtPosition() {
this.bases = new LinkedList<ReadBase>();
}
public void putReadBase(PileupElement pue) {
ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual());
bases.add(rb);
}
public Iterator<ReadBase> iterator() {
return bases.iterator();
}
public boolean isEmpty() {
return bases.isEmpty();
}
}

View File

@ -0,0 +1,402 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.phasing;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.phasing.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.*;
import java.util.*;
/**
* Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites.
*/
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
@By(DataSource.READS)
@ReadFilters({ZeroMappingQualityReadFilter.class})
// Filter out all reads with zero mapping quality
public class ReadBasedPhasingValidationWalker extends RodWalker<Integer, Integer> {
private LinkedList<String> rodNames = null;
@Argument(fullName = "sitePairsFile", shortName = "sitePairsFile", doc = "File of pairs of variants for which phasing in ROD should be assessed using input reads", required = true)
protected File sitePairsFile = null;
@Output
protected PrintStream out;
private Set<SitePair> sitePairs = null;
private String sampleName = null;
SiteGenotypeAndReads prevSiteAndReads = null;
private final static int NUM_IN_PAIR = 2; // trivial
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
public void initialize() {
rodNames = new LinkedList<String>();
rodNames.add("variant");
sitePairs = new TreeSet<SitePair>();
GenomeLocParser locParser = getToolkit().getGenomeLocParser();
InputStream sitePairsStream = null;
try {
sitePairsStream = new FileInputStream(sitePairsFile);
}
catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
throw new UserException("Problem opening file: " + sitePairsFile);
}
AsciiLineReader sitePairsReader = new AsciiLineReader(sitePairsStream);
while (true) {
String line = null;
try {
line = sitePairsReader.readLine();
}
catch (IOException ioe) {
ioe.printStackTrace();
throw new UserException("Problem reading file: " + sitePairsFile);
}
if (line == null)
break; // reached end of file
String[] twoSites = line.split("\t");
if (twoSites.length != 2)
throw new UserException("Must have PAIRS of sites in line " + line + " of " + sitePairsFile);
SitePair sp = new SitePair(locParser.parseGenomeLoc(twoSites[0]), locParser.parseGenomeLoc(twoSites[1]));
sitePairs.add(sp);
}
}
public boolean generateExtendedEvents() {
return false;
}
public Integer reduceInit() {
return 0;
}
/**
* @param tracker the meta-data tracker
* @param ref the reference base
* @param context the context for the given locus
* @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range.
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (tracker == null)
return null;
boolean relevantSitePair = false;
SitePair sp = null;
if (prevSiteAndReads != null) {
// all vc's below start at ref.getLocus() [due to requireStartHere = true]:
sp = new SitePair(prevSiteAndReads.site, ref.getLocus());
relevantSitePair = sitePairs.contains(sp);
}
if (context == null || !context.hasBasePileup())
return null;
ReadBackedPileup pileup = context.getBasePileup();
String nextName = null;
//
// TODO: ASSUMES THAT ALL READS COME FROM A SINGLE SAMPLE:
// TODO: CODE SHOULD BE AS FOLLOWS:
/*
Collection<String> sampNames = pileup.getSampleNames();
if (sampNames.size() != 1)
throw new UserException("Reads must be for exactly one sample [not multi-sample]");
nextName = sampNames.iterator().next();
if (nextName == null)
throw new UserException("Reads must be for exactly one sample");
if (sampleName == null)
sampleName = nextName;
else if (!nextName.equals(sampleName))
throw new UserException("Reads must have a single consistent sample name");
pileup = pileup.getPileupForSampleName(sampleName);
*/
//
ReadBasesAtPosition readBases = new ReadBasesAtPosition();
for (PileupElement p : pileup)
readBases.putReadBase(p);
ReadCounter rdCounts = null;
if (relevantSitePair) { // otherwise, processed the reads for their possible use in the future:
PhasingReadList buildReads = new PhasingReadList(NUM_IN_PAIR);
buildReads.updateBases(0, prevSiteAndReads.readBases);
buildReads.updateBases(1, readBases);
List<PhasingRead> reads = new LinkedList<PhasingRead>();
for (PhasingRead rd : buildReads) {
if (rd.getNonNullIndices().length == NUM_IN_PAIR) // only want reads with BOTH bases called [possibly as deleted ("D")]
reads.add(rd);
}
// Count the occurence of each "haplotype":
rdCounts = new ReadCounter();
for (PhasingRead rd : reads)
rdCounts.incrementCount(rd);
}
// Now, read the ROD and note the genotypes and their phase to be validated:
Set<Haplotype> calledHaplotypes = null;
List<Haplotype> allPossibleHaplotypes = null;
boolean requireStartHere = true; // only see each VariantContext once
boolean takeFirstOnly = true; // take only the first entry from the ROD file
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
if (vc.isFiltered() || !vc.isSNP())
continue;
if (vc.getNSamples() != 1)
throw new UserException("ROD file must have exactly one sample [not multi-sample]");
nextName = vc.getSampleNames().iterator().next();
if (sampleName == null)
sampleName = nextName;
else if (!nextName.equals(sampleName))
throw new UserException("ROD must have a single consistent sample name");
Genotype gt = vc.getGenotype(sampleName);
if (relevantSitePair) {
Genotype prevGt = prevSiteAndReads.gt;
List<Allele> prevAlleles = prevGt.getAlleles();
List<Allele> curAlleles = gt.getAlleles();
calledHaplotypes = new TreeSet<Haplotype>(); // implemented Haplotype.compareTo()
if (gt.isPhased()) {
if (gt.getPloidy() != prevGt.getPloidy())
throw new UserException("Invalid ROD file: cannot be phased AND have different ploidys!");
// Consider only the haplotypes called to be phased
Iterator<Allele> curAllIt = curAlleles.iterator();
for (Allele prevAll : prevAlleles) {
Allele curAll = curAllIt.next();
calledHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll));
}
}
// Consider EVERY combination of alleles as haplotypes [IF PHASED, this will give the contingency table in the CORRECT order]:
allPossibleHaplotypes = new LinkedList<Haplotype>();
for (Allele prevAll : prevAlleles) {
for (Allele curAll : curAlleles) {
allPossibleHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll));
}
}
}
prevSiteAndReads = new SiteGenotypeAndReads(ref.getLocus(), gt, readBases);
}
int processedPairs = 0;
if (relevantSitePair) {
Map<Haplotype, Integer> haplotypeCounts = new TreeMap<Haplotype, Integer>(); // implemented Haplotype.compareTo()
processedPairs = 1;
int totalCount = rdCounts.totalCount();
System.out.println("\nPair: " + sp + " [# reads = " + totalCount + "]");
int matchCount = 0;
for (Map.Entry<PhasingRead, Integer> rdEntry : rdCounts.entrySet()) {
PhasingRead read = rdEntry.getKey();
int count = rdEntry.getValue();
Haplotype readsHaplotype = new Haplotype(read);
haplotypeCounts.put(readsHaplotype, count);
boolean readMatchesCalledHaplotype = calledHaplotypes != null && calledHaplotypes.contains(readsHaplotype);
if (readMatchesCalledHaplotype)
matchCount += count;
System.out.println("read" + ": " + read + (readMatchesCalledHaplotype ? "*" : "") + "\tcount: " + count);
}
double percentMatchingReads = 100 * (matchCount / (double) totalCount);
System.out.println("% MATCHING reads: " + percentMatchingReads + " [of " + totalCount + " TOTAL reads]");
out.print(sp);
for (Haplotype hap : allPossibleHaplotypes) {
Integer count = haplotypeCounts.get(hap);
if (count == null) // haplotype may not have been observed in ANY reads
count = 0;
out.print("\t" + count);
}
out.println();
}
return processedPairs;
}
private Haplotype successiveAllelesToHaplotype(Allele prevAll, Allele curAll) {
byte prevBase = SNPallelePair.getSingleBase(prevAll);
byte curBase = SNPallelePair.getSingleBase(curAll);
byte[] hapBases = new byte[NUM_IN_PAIR];
hapBases[0] = prevBase;
hapBases[1] = curBase;
return new Haplotype(hapBases);
}
public Integer reduce(Integer addIn, Integer runningCount) {
if (addIn == null)
addIn = 0;
return runningCount + addIn;
}
/**
* @param result the number of reads and VariantContexts seen.
*/
public void onTraversalDone(Integer result) {
System.out.println("Validated " + result + " pairs of sites.");
}
}
class SitePair implements Comparable<SitePair> {
public GenomeLoc site1;
public GenomeLoc site2;
public SitePair(GenomeLoc site1, GenomeLoc site2) {
if (site1.size() > 1 || site2.size() > 1)
throw new UserException("Must give pairs of SINGLE-LOCUS record start sites");
this.site1 = site1;
this.site2 = site2;
}
public String toString() {
return site1.toString() + "\t" + site2.toString();
}
public int compareTo(SitePair other) {
int comp1 = site1.compareTo(other.site1);
if (comp1 != 0)
return comp1;
return site2.compareTo(other.site2);
}
}
class SiteGenotypeAndReads {
public GenomeLoc site;
public Genotype gt;
public ReadBasesAtPosition readBases;
public SiteGenotypeAndReads(GenomeLoc site, Genotype gt, ReadBasesAtPosition readBases) {
this.site = site;
this.gt = gt;
this.readBases = readBases;
}
}
class PhasingReadList implements Iterable<PhasingRead> {
private Map<String, PhasingRead> readsAtSites = null;
private int numSites;
public PhasingReadList(int numSites) {
this.readsAtSites = new HashMap<String, PhasingRead>();
this.numSites = numSites;
}
public void updateBases(int index, ReadBasesAtPosition readBases) {
if (readBases == null)
return;
for (ReadBase rb : readBases) {
String readName = rb.readName;
PhasingRead rd = readsAtSites.get(readName);
if (rd == null) {
rd = new PhasingRead(numSites, rb.mappingQual);
readsAtSites.put(readName, rd);
}
// Arbitrarily updates to the last base observed for this sample and read (rb.base):
rd.updateBaseAndQuality(index, rb.base, rb.baseQual);
}
}
public Iterator<PhasingRead> iterator() {
return readsAtSites.values().iterator();
}
public int size() {
return readsAtSites.size();
}
}
class ReadCounter {
private Map<PhasingRead, Integer> counts;
private int totalCount;
public ReadCounter() {
this.counts = new TreeMap<PhasingRead, Integer>(); // implemented PhasingRead.compareTo()
}
public void incrementCount(PhasingRead rd) {
Integer cnt = counts.get(rd);
if (cnt == null)
cnt = 0;
counts.put(rd, cnt + 1);
totalCount++;
}
public Set<Map.Entry<PhasingRead, Integer>> entrySet() {
return counts.entrySet();
}
public int totalCount() {
return totalCount;
}
}

View File

@ -93,6 +93,10 @@ public class BaseUtils {
return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2);
} }
static public boolean extendedBasesAreEqual(byte base1, byte base2) {
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
}
/** /**
* Converts a IUPAC nucleotide code to a pair of bases * Converts a IUPAC nucleotide code to a pair of bases