Added validator for phasing using read information, e.g., PacBio: ReadBasedPhasingValidationWalker
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4940 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d203f5e39a
commit
4b37710bcd
|
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class BaseArray implements Comparable<BaseArray> {
|
||||
protected Byte[] bases;
|
||||
|
||||
public BaseArray(byte[] bases) {
|
||||
this.bases = new Byte[bases.length];
|
||||
for (int i = 0; i < bases.length; i++)
|
||||
this.bases[i] = bases[i];
|
||||
}
|
||||
|
||||
public BaseArray(Byte[] bases) {
|
||||
this.bases = Arrays.copyOf(bases, bases.length);
|
||||
}
|
||||
|
||||
public BaseArray(int length) {
|
||||
this.bases = new Byte[length];
|
||||
Arrays.fill(this.bases, null);
|
||||
}
|
||||
|
||||
public BaseArray(BaseArray other) {
|
||||
this(other.bases);
|
||||
}
|
||||
|
||||
public void updateBase(int index, Byte base) {
|
||||
bases[index] = base;
|
||||
}
|
||||
|
||||
public Byte getBase(int index) {
|
||||
return bases[index];
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return bases.length;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder(bases.length);
|
||||
for (Byte b : bases)
|
||||
sb.append(b != null ? (char) b.byteValue() : "_");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public int compareTo(BaseArray that) {
|
||||
int sz = this.bases.length;
|
||||
if (sz != that.bases.length)
|
||||
return (sz - that.bases.length);
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
Byte thisBase = this.getBase(i);
|
||||
Byte thatBase = that.getBase(i);
|
||||
if (thisBase == null || thatBase == null) {
|
||||
if (thisBase == null && thatBase != null) {
|
||||
return -1;
|
||||
}
|
||||
else if (thisBase != null && thatBase == null) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
else if (!BaseUtils.extendedBasesAreEqual(thisBase, thatBase)) {
|
||||
return thisBase - thatBase;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int[] getNonNullIndices() {
|
||||
List<Integer> nonNull = new LinkedList<Integer>();
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
if (getBase(i) != null)
|
||||
nonNull.add(i);
|
||||
}
|
||||
|
||||
int[] nonNullArray = new int[nonNull.size()];
|
||||
int index = 0;
|
||||
for (int i : nonNull)
|
||||
nonNullArray[index++] = i;
|
||||
return nonNullArray;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class Haplotype extends BaseArray implements Cloneable {
|
||||
public Haplotype(byte[] bases) {
|
||||
super(bases);
|
||||
}
|
||||
|
||||
private Haplotype(Byte[] bases) {
|
||||
super(bases);
|
||||
}
|
||||
|
||||
public Haplotype(Haplotype other) {
|
||||
super(other);
|
||||
}
|
||||
|
||||
public Haplotype(BaseArray baseArr) {
|
||||
super(baseArr.bases);
|
||||
|
||||
if (baseArr.getNonNullIndices().length != baseArr.bases.length)
|
||||
throw new ReviewedStingException("Should NEVER call Haplotype ctor with null bases!");
|
||||
}
|
||||
|
||||
public void updateBase(int index, Byte base) {
|
||||
if (base == null) {
|
||||
throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!");
|
||||
}
|
||||
super.updateBase(index, base);
|
||||
}
|
||||
|
||||
public Haplotype clone() {
|
||||
try {
|
||||
super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
return new Haplotype(this);
|
||||
}
|
||||
|
||||
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
|
||||
|
||||
public Haplotype subHaplotype(int fromIndex, int toIndex) {
|
||||
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.PreciseNonNegativeDouble;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class PhasingRead extends BaseArray {
|
||||
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
|
||||
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
|
||||
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
|
||||
|
||||
public PhasingRead(int length, int mappingQual) {
|
||||
super(length);
|
||||
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
|
||||
|
||||
this.baseProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseProbs, null);
|
||||
|
||||
this.baseErrorProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseErrorProbs, null);
|
||||
}
|
||||
|
||||
public void updateBaseAndQuality(int index, Byte base, byte baseQual) {
|
||||
updateBase(index, base);
|
||||
|
||||
double errProb = QualityUtils.qualToErrorProb(baseQual);
|
||||
|
||||
// The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]:
|
||||
errProb = Math.max(errProb, 1.0 - mappingProb.getValue());
|
||||
|
||||
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read
|
||||
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases
|
||||
}
|
||||
|
||||
public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) {
|
||||
PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0);
|
||||
for (Haplotype h : hapClass)
|
||||
value.plusEqual(matchHaplotypeScore(h));
|
||||
|
||||
return new PhasingScore(value);
|
||||
}
|
||||
|
||||
private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) {
|
||||
PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0);
|
||||
|
||||
int sz = this.bases.length;
|
||||
if (sz != hap.bases.length)
|
||||
throw new ReviewedStingException("Read and Haplotype should have same length to be compared!");
|
||||
|
||||
// Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness:
|
||||
score.timesEqual(mappingProb);
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
Byte thisBase = this.getBase(i);
|
||||
Byte hapBase = hap.getBase(i);
|
||||
if (thisBase != null && hapBase != null) {
|
||||
if (BaseUtils.basesAreEqual(thisBase, hapBase))
|
||||
score.timesEqual(baseProbs[i]);
|
||||
else
|
||||
score.timesEqual(baseErrorProbs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
}
|
||||
|
|
@ -21,7 +21,6 @@
|
|||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
|
@ -202,7 +201,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
return null;
|
||||
|
||||
mostDownstreamLocusReached = ref.getLocus();
|
||||
if ( DEBUG ) logger.debug("map() at: " + mostDownstreamLocusReached);
|
||||
if (DEBUG) logger.debug("map() at: " + mostDownstreamLocusReached);
|
||||
|
||||
PhasingStats phaseStats = new PhasingStats();
|
||||
List<VariantContext> unprocessedList = new LinkedList<VariantContext>();
|
||||
|
|
@ -210,13 +209,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
boolean requireStartHere = true; // only see each VariantContext once
|
||||
boolean takeFirstOnly = false; // take as many entries as the VCF file has
|
||||
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
|
||||
if ( samplesToPhase != null ) vc = reduceVCToSamples(vc, samplesToPhase);
|
||||
if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase);
|
||||
|
||||
if (ReadBackedPhasingWalker.processVariantInPhasing(vc)) {
|
||||
VariantAndReads vr = new VariantAndReads(vc, context);
|
||||
unphasedSiteQueue.add(vr);
|
||||
|
||||
if ( DEBUG ) logger.debug("Added variant to queue = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant));
|
||||
if (DEBUG)
|
||||
logger.debug("Added variant to queue = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
|
||||
}
|
||||
else {
|
||||
unprocessedList.add(vc); // Finished with the unprocessed variant, and writer can enforce sorting on-the-fly
|
||||
|
|
@ -240,13 +240,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private static final Set<String> KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<String>(Arrays.asList("PQ"));
|
||||
|
||||
private VariantContext reduceVCToSamples(VariantContext vc, List<String> samplesToPhase) {
|
||||
// for ( String sample : samplesToPhase )
|
||||
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
|
||||
VariantContext subvc = vc.subContextFromGenotypes(vc.getGenotypes(samplesToPhase).values());
|
||||
// logger.debug("original VC = " + vc);
|
||||
// logger.debug("sub VC = " + subvc);
|
||||
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF );
|
||||
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
|
||||
}
|
||||
|
||||
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
|
||||
|
|
@ -255,7 +256,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
while (!unphasedSiteQueue.isEmpty()) {
|
||||
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
|
||||
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
|
||||
if (startDistancesAreInWindowRange(mostDownstreamLocusReached, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),nextToPhaseVc))) {
|
||||
if (startDistancesAreInWindowRange(mostDownstreamLocusReached, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextToPhaseVc))) {
|
||||
/* mostDownstreamLocusReached is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc
|
||||
(note that we ASSUME that the VCF is ordered by <contig,locus>).
|
||||
Note that this will always leave at least one entry (the last one), since mostDownstreamLocusReached is in range of itself.
|
||||
|
|
@ -266,16 +267,17 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
// Update partiallyPhasedSites before it's used in phaseSite:
|
||||
oldPhasedList.addAll(discardIrrelevantPhasedSites());
|
||||
if ( DEBUG ) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList));
|
||||
if (DEBUG) logger.debug("oldPhasedList(1st) = " + toStringVCL(oldPhasedList));
|
||||
|
||||
VariantAndReads vr = unphasedSiteQueue.remove();
|
||||
if ( DEBUG ) logger.debug("Performing phasing for " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant));
|
||||
if (DEBUG)
|
||||
logger.debug("Performing phasing for " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant));
|
||||
phaseSite(vr, phaseStats);
|
||||
}
|
||||
|
||||
// Update partiallyPhasedSites after phaseSite is done:
|
||||
oldPhasedList.addAll(discardIrrelevantPhasedSites());
|
||||
if ( DEBUG ) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList));
|
||||
if (DEBUG) logger.debug("oldPhasedList(2nd) = " + toStringVCL(oldPhasedList));
|
||||
|
||||
if (outputMultipleBaseCountsWriter != null)
|
||||
outputMultipleBaseCountsWriter.outputMultipleBaseCounts();
|
||||
|
|
@ -288,7 +290,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
GenomeLoc nextToPhaseLoc = null;
|
||||
if (!unphasedSiteQueue.isEmpty())
|
||||
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),unphasedSiteQueue.peek().variant);
|
||||
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant);
|
||||
|
||||
while (!partiallyPhasedSites.isEmpty()) {
|
||||
if (nextToPhaseLoc != null) { // otherwise, unphasedSiteQueue.isEmpty(), and therefore no need to keep any of the "past"
|
||||
|
|
@ -313,7 +315,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
private void phaseSite(VariantAndReads vr, PhasingStats phaseStats) {
|
||||
VariantContext vc = vr.variant;
|
||||
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc));
|
||||
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
|
||||
UnfinishedVariantAndReads uvr = new UnfinishedVariantAndReads(vr);
|
||||
UnfinishedVariantContext uvc = uvr.unfinishedVariant;
|
||||
|
|
@ -325,7 +327,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
String samp = sampGtEntry.getKey();
|
||||
Genotype gt = sampGtEntry.getValue();
|
||||
|
||||
if ( DEBUG ) logger.debug("sample = " + samp);
|
||||
if (DEBUG) logger.debug("sample = " + samp);
|
||||
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
||||
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
||||
// true <-> can trivially phase a hom site relative to ANY previous site:
|
||||
|
|
@ -336,7 +338,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
|
||||
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
|
||||
SNPallelePair allelePair = new SNPallelePair(gt);
|
||||
if ( DEBUG ) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
|
||||
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
|
||||
|
||||
DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
|
||||
/* Notes:
|
||||
|
|
@ -350,15 +352,17 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if ( DEBUG ) logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc));
|
||||
if (DEBUG)
|
||||
logger.debug("MORE than " + (MAX_FRACTION_OF_INCONSISTENT_READS * 100) + "% of the reads are inconsistent for phasing of " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
uvc.setPhasingInconsistent();
|
||||
}
|
||||
|
||||
if (genotypesArePhased) {
|
||||
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
|
||||
|
||||
if ( DEBUG ) logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n");
|
||||
if ( DEBUG ) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
|
||||
if (DEBUG)
|
||||
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n");
|
||||
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
|
||||
|
||||
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
|
||||
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
|
||||
|
|
@ -389,7 +393,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
if (statsWriter != null)
|
||||
statsWriter.addStat(samp, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
statsWriter.addStat(samp, VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
|
||||
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
|
||||
if (sampPhaseCounts == null) {
|
||||
|
|
@ -436,7 +440,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
private Genotype[] hetGenotypes = null;
|
||||
private DoublyLinkedList.BidirectionalIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = null;
|
||||
private int phasingSiteIndex = -1;
|
||||
private Map<String, Read> readsAtHetSites = null;
|
||||
private Map<String, PhasingRead> readsAtHetSites = null;
|
||||
|
||||
public boolean hasPreviousHets() {
|
||||
return phasingSiteIndex > 0;
|
||||
|
|
@ -458,7 +462,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
else if (gt.isHet()) {
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation());
|
||||
listHetGenotypes.add(grb);
|
||||
if ( DEBUG ) logger.debug("Using UPSTREAM het site = " + grb.loc);
|
||||
if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
|
||||
prevHetAndInteriorIt = phasedIt.clone();
|
||||
}
|
||||
}
|
||||
|
|
@ -471,10 +475,11 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()]
|
||||
|
||||
// Add the (het) position to be phased:
|
||||
GenomeLoc phaseLocus = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vr.variant);
|
||||
GenomeLoc phaseLocus = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant);
|
||||
GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus);
|
||||
listHetGenotypes.add(grbPhase);
|
||||
if ( DEBUG ) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
|
||||
if (DEBUG)
|
||||
logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
|
||||
|
||||
// Include as-of-yet unphased sites in the phasing computation:
|
||||
for (VariantAndReads nextVr : unphasedSiteQueue) {
|
||||
|
|
@ -485,9 +490,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
break;
|
||||
}
|
||||
else if (gt.isHet()) {
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),nextVr.variant));
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant));
|
||||
listHetGenotypes.add(grb);
|
||||
if ( DEBUG ) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
|
||||
if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -515,7 +520,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
buildReadsAtHetSites(listHetGenotypes, onlyKeepReads);
|
||||
|
||||
// Copy to a fixed-size array:
|
||||
if ( DEBUG ) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
if (DEBUG)
|
||||
logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
hetGenotypes = new Genotype[listHetGenotypes.size()];
|
||||
int index = 0;
|
||||
for (GenotypeAndReadBases copyGrb : listHetGenotypes)
|
||||
|
|
@ -531,7 +537,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private void buildReadsAtHetSites(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phasingLoc, Set<String> onlyKeepReads) {
|
||||
readsAtHetSites = new HashMap<String, Read>();
|
||||
readsAtHetSites = new HashMap<String, PhasingRead>();
|
||||
|
||||
int index = 0;
|
||||
for (GenotypeAndReadBases grb : listHetGenotypes) {
|
||||
|
|
@ -542,9 +548,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
if (onlyKeepReads != null && !onlyKeepReads.contains(readName)) // if onlyKeepReads exists, ignore reads not in onlyKeepReads
|
||||
continue;
|
||||
|
||||
Read rd = readsAtHetSites.get(readName);
|
||||
PhasingRead rd = readsAtHetSites.get(readName);
|
||||
if (rd == null) {
|
||||
rd = new Read(listHetGenotypes.size(), rb.mappingQual);
|
||||
rd = new PhasingRead(listHetGenotypes.size(), rb.mappingQual);
|
||||
readsAtHetSites.put(readName, rd);
|
||||
}
|
||||
else if (outputMultipleBaseCountsWriter != null && rd.getBase(index) != null // rd already has a base at index
|
||||
|
|
@ -558,13 +564,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
index++;
|
||||
}
|
||||
if ( DEBUG ) logger.debug("Number of sites in window = " + index);
|
||||
if (DEBUG) logger.debug("Number of sites in window = " + index);
|
||||
|
||||
if ( DEBUG && logger.isDebugEnabled()) {
|
||||
if (DEBUG && logger.isDebugEnabled()) {
|
||||
logger.debug("ALL READS [phasingSiteIndex = " + phasingSiteIndex + "]:");
|
||||
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
|
||||
for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
|
||||
String rdName = nameToReads.getKey();
|
||||
Read rd = nameToReads.getValue();
|
||||
PhasingRead rd = nameToReads.getValue();
|
||||
logger.debug(rd + "\t" + rdName);
|
||||
}
|
||||
}
|
||||
|
|
@ -620,16 +626,16 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
EdgeToReads edgeToReads = new EdgeToReads();
|
||||
Set<Integer> sitesWithEdges = new TreeSet<Integer>();
|
||||
|
||||
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
|
||||
for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
|
||||
String rdName = nameToReads.getKey();
|
||||
Read rd = nameToReads.getValue();
|
||||
PhasingRead rd = nameToReads.getValue();
|
||||
|
||||
int[] siteInds = rd.getNonNullIndices();
|
||||
// Connect each pair of non-null sites in rd:
|
||||
for (int i = 0; i < siteInds.length; i++) {
|
||||
for (int j = i + 1; j < siteInds.length; j++) {
|
||||
GraphEdge e = new GraphEdge(siteInds[i], siteInds[j]);
|
||||
if ( DEBUG ) logger.debug("Read = " + rdName + " is adding edge: " + e);
|
||||
if (DEBUG) logger.debug("Read = " + rdName + " is adding edge: " + e);
|
||||
readGraph.addEdge(e);
|
||||
|
||||
edgeToReads.addRead(e, rdName);
|
||||
|
|
@ -639,7 +645,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
}
|
||||
}
|
||||
if ( DEBUG ) logger.debug("Read graph:\n" + readGraph);
|
||||
if (DEBUG) logger.debug("Read graph:\n" + readGraph);
|
||||
Set<String> keepReads = new HashSet<String>();
|
||||
|
||||
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
|
||||
|
|
@ -692,7 +698,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
int cur = phasingSiteIndex;
|
||||
|
||||
if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev
|
||||
if ( DEBUG ) logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]");
|
||||
if (DEBUG)
|
||||
logger.debug("NO READ PATH between PHASE site [" + cur + "] and UPSTREAM site [" + prev + "]");
|
||||
readsAtHetSites.clear();
|
||||
return keepReads;
|
||||
}
|
||||
|
|
@ -703,7 +710,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
IntegerSet[] removedSiteSameCCAsPrev = new IntegerSet[numHetSites];
|
||||
IntegerSet[] removedSiteSameCCAsCur = new IntegerSet[numHetSites];
|
||||
for (int i : sitesWithEdges) {
|
||||
if ( DEBUG ) logger.debug("Calculating CC after removing edges of site: " + i);
|
||||
if (DEBUG) logger.debug("Calculating CC after removing edges of site: " + i);
|
||||
|
||||
// Remove all edges incident to i and see which positions have paths to prev and cur:
|
||||
Collection<GraphEdge> removedEdges = readGraph.removeAllIncidentEdges(i);
|
||||
|
|
@ -713,15 +720,15 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
removedSiteSameCCAsPrev[i] = new IntegerSet(ccAfterRemove.inSameSetAs(prev, sitesWithEdges));
|
||||
removedSiteSameCCAsCur[i] = new IntegerSet(ccAfterRemove.inSameSetAs(cur, sitesWithEdges));
|
||||
|
||||
if ( DEBUG ) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
|
||||
if ( DEBUG ) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
|
||||
if (DEBUG) logger.debug("Same CC as previous [" + prev + "]: " + removedSiteSameCCAsPrev[i]);
|
||||
if (DEBUG) logger.debug("Same CC as current [" + cur + "]: " + removedSiteSameCCAsCur[i]);
|
||||
|
||||
// Add the removed edges back in:
|
||||
readGraph.addEdges(removedEdges);
|
||||
}
|
||||
|
||||
for (GraphEdge e : readGraph) {
|
||||
if ( DEBUG ) logger.debug("Testing the path-connectivity of Edge: " + e);
|
||||
if (DEBUG) logger.debug("Testing the path-connectivity of Edge: " + e);
|
||||
|
||||
/* Edge e={v1,v2} contributes a path between prev and cur for testRead iff:
|
||||
testRead[v1] != null, testRead[v2] != null, and there is a path from prev ---> v1 -> v2 ---> cur [or vice versa].
|
||||
|
|
@ -746,13 +753,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
// Retain only the reads that contain an edge in a path connecting prev and cur:
|
||||
Iterator<Map.Entry<String, Read>> readIt = readsAtHetSites.entrySet().iterator();
|
||||
Iterator<Map.Entry<String, PhasingRead>> readIt = readsAtHetSites.entrySet().iterator();
|
||||
while (readIt.hasNext()) {
|
||||
Map.Entry<String, Read> nameToReads = readIt.next();
|
||||
Map.Entry<String, PhasingRead> nameToReads = readIt.next();
|
||||
String rdName = nameToReads.getKey();
|
||||
if (!keepReads.contains(rdName)) {
|
||||
readIt.remove();
|
||||
if ( DEBUG ) logger.debug("Removing extraneous read: " + rdName);
|
||||
if (DEBUG) logger.debug("Removing extraneous read: " + rdName);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -761,8 +768,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
private List<GenotypeAndReadBases> removeExtraneousSites(List<GenotypeAndReadBases> listHetGenotypes) {
|
||||
Set<Integer> sitesWithReads = new HashSet<Integer>();
|
||||
for (Map.Entry<String, Read> nameToReads : readsAtHetSites.entrySet()) {
|
||||
Read rd = nameToReads.getValue();
|
||||
for (Map.Entry<String, PhasingRead> nameToReads : readsAtHetSites.entrySet()) {
|
||||
PhasingRead rd = nameToReads.getValue();
|
||||
for (int i : rd.getNonNullIndices())
|
||||
sitesWithReads.add(i);
|
||||
}
|
||||
|
|
@ -779,7 +786,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) {
|
||||
keepHetSites.add(grb);
|
||||
if (!keepSite)
|
||||
if ( DEBUG ) logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
|
||||
if (DEBUG)
|
||||
logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
|
||||
}
|
||||
else if (index <= phasingSiteIndex)
|
||||
numPrecedingRemoved++;
|
||||
|
|
@ -792,7 +800,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private List<GenotypeAndReadBases> trimWindow(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phaseLocus) {
|
||||
if ( DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
|
||||
if (DEBUG)
|
||||
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
|
||||
|
||||
int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes
|
||||
int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites!
|
||||
|
|
@ -815,7 +824,8 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
|
||||
phasingSiteIndex -= startIndex;
|
||||
listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex);
|
||||
if ( DEBUG ) logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
if (DEBUG)
|
||||
logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
|
||||
return listHetGenotypes;
|
||||
}
|
||||
|
|
@ -831,9 +841,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
if (DEBUG && logger.isDebugEnabled()) {
|
||||
logger.debug("Number of USED reads [connecting the two positions to be phased] at sites: " + phaseWindow.readsAtHetSites.size());
|
||||
logger.debug("USED READS:");
|
||||
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
|
||||
for (Map.Entry<String, PhasingRead> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
|
||||
String rdName = nameToReads.getKey();
|
||||
Read rd = nameToReads.getValue();
|
||||
PhasingRead rd = nameToReads.getValue();
|
||||
logger.debug(rd + "\t" + rdName);
|
||||
}
|
||||
}
|
||||
|
|
@ -847,19 +857,20 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
double totalAbsPQchange = 0;
|
||||
int numPQchangesObserved = 0;
|
||||
|
||||
for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
|
||||
Read rd = nameToReads.getValue();
|
||||
if ( DEBUG ) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey());
|
||||
for (Map.Entry<String, PhasingRead> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
|
||||
PhasingRead rd = nameToReads.getValue();
|
||||
if (DEBUG) logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey());
|
||||
|
||||
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
|
||||
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
|
||||
pte.getScore().integrateReadScore(score);
|
||||
if ( DEBUG ) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
|
||||
if (DEBUG) logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
|
||||
}
|
||||
|
||||
// Check the current best haplotype assignment and compare it to the previous one:
|
||||
MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false);
|
||||
if ( DEBUG ) logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
|
||||
if (DEBUG)
|
||||
logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality);
|
||||
if (prevMaxHapAndQual != null) {
|
||||
double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality;
|
||||
|
||||
|
|
@ -867,7 +878,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
numHighQualityIterations++;
|
||||
if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase
|
||||
(numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ
|
||||
if ( DEBUG ) logger.debug("Inconsistent read found!");
|
||||
if (DEBUG) logger.debug("Inconsistent read found!");
|
||||
numInconsistentIterations++;
|
||||
}
|
||||
}
|
||||
|
|
@ -878,12 +889,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
prevMaxHapAndQual = curMaxHapAndQual;
|
||||
}
|
||||
|
||||
if ( DEBUG ) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
|
||||
if (DEBUG) logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
|
||||
MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true);
|
||||
double posteriorProb = maxHapQual.maxEntry.getScore().getValue();
|
||||
|
||||
if ( DEBUG ) logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
|
||||
if ( DEBUG ) logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
|
||||
if (DEBUG)
|
||||
logger.debug("MAX hap:\t" + maxHapQual.maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + maxHapQual.phaseQuality);
|
||||
if (DEBUG)
|
||||
logger.debug("Number of used reads " + phaseWindow.readsAtHetSites.size() + "; number of high PQ iterations " + numHighQualityIterations + "; number of inconsistencies " + numInconsistentIterations);
|
||||
|
||||
boolean phasingContainsInconsistencies = false;
|
||||
if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS)
|
||||
|
|
@ -951,7 +964,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) {
|
||||
return startDistancesAreInWindowRange(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc1), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc2));
|
||||
return startDistancesAreInWindowRange(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc1), VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
|
||||
}
|
||||
|
||||
private boolean startDistancesAreInWindowRange(GenomeLoc loc1, GenomeLoc loc2) {
|
||||
|
|
@ -959,7 +972,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) {
|
||||
return uvc1.getLocation().distance(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc2));
|
||||
return uvc1.getLocation().distance(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
|
||||
}
|
||||
|
||||
public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) {
|
||||
|
|
@ -1006,13 +1019,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private void writeVCF(VariantContext vc) {
|
||||
if ( samplesToPhase == null || vc.isNotFiltered() )
|
||||
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
|
||||
if (samplesToPhase == null || vc.isNotFiltered())
|
||||
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
|
||||
WriteVCF.writeVCF(vc, writer, logger);
|
||||
}
|
||||
|
||||
public static boolean processVariantInPhasing(VariantContext vc) {
|
||||
return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || ! vc.isVariant()); // we can handle the non-variant case as well
|
||||
return vc.isNotFiltered() && ((vc.isSNP() && vc.isBiallelic()) || !vc.isVariant()); // we can handle the non-variant case as well
|
||||
//return isUnfilteredBiallelicSNP(vc);
|
||||
}
|
||||
|
||||
|
|
@ -1140,47 +1153,11 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
else
|
||||
sb.append(" -- ");
|
||||
|
||||
sb.append(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),vc));
|
||||
sb.append(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static class ReadBase {
|
||||
public String readName;
|
||||
public byte base;
|
||||
public int mappingQual;
|
||||
public byte baseQual;
|
||||
|
||||
public ReadBase(String readName, byte base, int mappingQual, byte baseQual) {
|
||||
this.readName = readName;
|
||||
this.base = base;
|
||||
this.mappingQual = mappingQual;
|
||||
this.baseQual = baseQual;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ReadBasesAtPosition implements Iterable<ReadBase> {
|
||||
// list of: <read name, base>
|
||||
private LinkedList<ReadBase> bases;
|
||||
|
||||
public ReadBasesAtPosition() {
|
||||
this.bases = new LinkedList<ReadBase>();
|
||||
}
|
||||
|
||||
public void putReadBase(PileupElement pue) {
|
||||
ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual());
|
||||
bases.add(rb);
|
||||
}
|
||||
|
||||
public Iterator<ReadBase> iterator() {
|
||||
return bases.iterator();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return bases.isEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// THIS IMPLEMENTATION WILL FAIL WHEN NOT DEALING WITH SNP Alleles (e.g., MNP or INDEL), SINCE THEN THE Allele.getBases()
|
||||
// FUNCTION WILL RETURN VARIABLE-LENGTH Byte ARRAYS. IN THAT CASE, BaseArray/Haplotype/Read WILL NEED TO BE REPLACED WITH
|
||||
|
|
@ -1432,7 +1409,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
public void outputMultipleBaseCounts() {
|
||||
GenomeLoc nextToPhaseLoc = null;
|
||||
if (!unphasedSiteQueue.isEmpty())
|
||||
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),unphasedSiteQueue.peek().variant);
|
||||
nextToPhaseLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), unphasedSiteQueue.peek().variant);
|
||||
|
||||
outputMultipleBaseCounts(nextToPhaseLoc);
|
||||
}
|
||||
|
|
@ -1484,108 +1461,6 @@ class PhasingScore extends PreciseNonNegativeDouble {
|
|||
}
|
||||
}
|
||||
|
||||
abstract class BaseArray implements Comparable<BaseArray> {
|
||||
protected Byte[] bases;
|
||||
|
||||
public BaseArray(byte[] bases) {
|
||||
this.bases = new Byte[bases.length];
|
||||
for (int i = 0; i < bases.length; i++)
|
||||
this.bases[i] = bases[i];
|
||||
}
|
||||
|
||||
public BaseArray(Byte[] bases) {
|
||||
this.bases = Arrays.copyOf(bases, bases.length);
|
||||
}
|
||||
|
||||
public BaseArray(int length) {
|
||||
this.bases = new Byte[length];
|
||||
Arrays.fill(this.bases, null);
|
||||
}
|
||||
|
||||
public BaseArray(BaseArray other) {
|
||||
this(other.bases);
|
||||
}
|
||||
|
||||
public void updateBase(int index, Byte base) {
|
||||
bases[index] = base;
|
||||
}
|
||||
|
||||
public Byte getBase(int index) {
|
||||
return bases[index];
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return bases.length;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder(bases.length);
|
||||
for (Byte b : bases)
|
||||
sb.append(b != null ? (char) b.byteValue() : "_");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public int compareTo(BaseArray that) {
|
||||
int sz = this.bases.length;
|
||||
if (sz != that.bases.length)
|
||||
return (sz - that.bases.length);
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
Byte thisBase = this.getBase(i);
|
||||
Byte thatBase = that.getBase(i);
|
||||
if (thisBase == null || thatBase == null) {
|
||||
if (thisBase == null && thatBase != null) {
|
||||
return -1;
|
||||
}
|
||||
else if (thisBase != null && thatBase == null) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
else if (!BaseUtils.basesAreEqual(thisBase, thatBase)) {
|
||||
return thisBase - thatBase;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
class Haplotype extends BaseArray implements Cloneable {
|
||||
public Haplotype(byte[] bases) {
|
||||
super(bases);
|
||||
}
|
||||
|
||||
private Haplotype(Byte[] bases) {
|
||||
super(bases);
|
||||
}
|
||||
|
||||
public Haplotype(Haplotype other) {
|
||||
super(other);
|
||||
}
|
||||
|
||||
public void updateBase(int index, Byte base) {
|
||||
if (base == null) {
|
||||
throw new ReviewedStingException("Internal error: CANNOT have null for a missing Haplotype base!");
|
||||
}
|
||||
super.updateBase(index, base);
|
||||
}
|
||||
|
||||
public Haplotype clone() {
|
||||
try {
|
||||
super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
return new Haplotype(this);
|
||||
}
|
||||
|
||||
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
|
||||
|
||||
public Haplotype subHaplotype(int fromIndex, int toIndex) {
|
||||
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
|
||||
}
|
||||
}
|
||||
|
||||
class HaplotypeClass implements Iterable<Haplotype> {
|
||||
private ArrayList<Haplotype> haps;
|
||||
private Haplotype rep;
|
||||
|
|
@ -1619,82 +1494,6 @@ class HaplotypeClass implements Iterable<Haplotype> {
|
|||
}
|
||||
}
|
||||
|
||||
class Read extends BaseArray {
|
||||
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
|
||||
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
|
||||
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT
|
||||
|
||||
public Read(int length, int mappingQual) {
|
||||
super(length);
|
||||
|
||||
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
|
||||
|
||||
this.baseProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseProbs, null);
|
||||
|
||||
this.baseErrorProbs = new PreciseNonNegativeDouble[length];
|
||||
Arrays.fill(this.baseErrorProbs, null);
|
||||
}
|
||||
|
||||
public void updateBaseAndQuality(int index, Byte base, byte baseQual) {
|
||||
updateBase(index, base);
|
||||
|
||||
double errProb = QualityUtils.qualToErrorProb(baseQual);
|
||||
|
||||
// The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]:
|
||||
errProb = Math.max(errProb, 1.0 - mappingProb.getValue());
|
||||
|
||||
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read
|
||||
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases
|
||||
}
|
||||
|
||||
public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) {
|
||||
PreciseNonNegativeDouble value = new PreciseNonNegativeDouble(0.0);
|
||||
for (Haplotype h : hapClass)
|
||||
value.plusEqual(matchHaplotypeScore(h));
|
||||
|
||||
return new PhasingScore(value);
|
||||
}
|
||||
|
||||
private PreciseNonNegativeDouble matchHaplotypeScore(Haplotype hap) {
|
||||
PreciseNonNegativeDouble score = new PreciseNonNegativeDouble(1.0);
|
||||
|
||||
int sz = this.bases.length;
|
||||
if (sz != hap.bases.length)
|
||||
throw new ReviewedStingException("Read and Haplotype should have same length to be compared!");
|
||||
|
||||
// Technically, this HAS NO EFFECT since it is multiplied in for ALL haplotype pairs, but do so for completeness:
|
||||
score.timesEqual(mappingProb);
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
Byte thisBase = this.getBase(i);
|
||||
Byte hapBase = hap.getBase(i);
|
||||
if (thisBase != null && hapBase != null) {
|
||||
if (BaseUtils.basesAreEqual(thisBase, hapBase))
|
||||
score.timesEqual(baseProbs[i]);
|
||||
else
|
||||
score.timesEqual(baseErrorProbs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
public int[] getNonNullIndices() {
|
||||
List<Integer> nonNull = new LinkedList<Integer>();
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
if (getBase(i) != null)
|
||||
nonNull.add(i);
|
||||
}
|
||||
|
||||
int[] nonNullArray = new int[nonNull.size()];
|
||||
int index = 0;
|
||||
for (int i : nonNull)
|
||||
nonNullArray[index++] = i;
|
||||
return nonNullArray;
|
||||
}
|
||||
}
|
||||
|
||||
class PhasingStats {
|
||||
private int numReads;
|
||||
private int numVarSites;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
public class ReadBase {
|
||||
public String readName;
|
||||
public byte base;
|
||||
public int mappingQual;
|
||||
public byte baseQual;
|
||||
|
||||
public ReadBase(String readName, byte base, int mappingQual, byte baseQual) {
|
||||
this.readName = readName;
|
||||
this.base = base;
|
||||
this.mappingQual = mappingQual;
|
||||
this.baseQual = baseQual;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class ReadBasesAtPosition implements Iterable<ReadBase> {
|
||||
// list of: <read name, base>
|
||||
private LinkedList<ReadBase> bases;
|
||||
|
||||
public ReadBasesAtPosition() {
|
||||
this.bases = new LinkedList<ReadBase>();
|
||||
}
|
||||
|
||||
public void putReadBase(PileupElement pue) {
|
||||
ReadBase rb = new ReadBase(pue.getRead().getReadName(), pue.getBase(), pue.getMappingQual(), pue.getQual());
|
||||
bases.add(rb);
|
||||
}
|
||||
|
||||
public Iterator<ReadBase> iterator() {
|
||||
return bases.iterator();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return bases.isEmpty();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,402 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.phasing;
|
||||
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.phasing.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites.
|
||||
*/
|
||||
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
|
||||
@By(DataSource.READS)
|
||||
|
||||
@ReadFilters({ZeroMappingQualityReadFilter.class})
|
||||
// Filter out all reads with zero mapping quality
|
||||
|
||||
public class ReadBasedPhasingValidationWalker extends RodWalker<Integer, Integer> {
|
||||
private LinkedList<String> rodNames = null;
|
||||
|
||||
@Argument(fullName = "sitePairsFile", shortName = "sitePairsFile", doc = "File of pairs of variants for which phasing in ROD should be assessed using input reads", required = true)
|
||||
protected File sitePairsFile = null;
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
private Set<SitePair> sitePairs = null;
|
||||
private String sampleName = null;
|
||||
|
||||
SiteGenotypeAndReads prevSiteAndReads = null;
|
||||
|
||||
private final static int NUM_IN_PAIR = 2; // trivial
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
public void initialize() {
|
||||
rodNames = new LinkedList<String>();
|
||||
rodNames.add("variant");
|
||||
|
||||
sitePairs = new TreeSet<SitePair>();
|
||||
GenomeLocParser locParser = getToolkit().getGenomeLocParser();
|
||||
|
||||
InputStream sitePairsStream = null;
|
||||
try {
|
||||
sitePairsStream = new FileInputStream(sitePairsFile);
|
||||
}
|
||||
catch (FileNotFoundException fnfe) {
|
||||
fnfe.printStackTrace();
|
||||
throw new UserException("Problem opening file: " + sitePairsFile);
|
||||
}
|
||||
|
||||
AsciiLineReader sitePairsReader = new AsciiLineReader(sitePairsStream);
|
||||
while (true) {
|
||||
String line = null;
|
||||
try {
|
||||
line = sitePairsReader.readLine();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
ioe.printStackTrace();
|
||||
throw new UserException("Problem reading file: " + sitePairsFile);
|
||||
}
|
||||
if (line == null)
|
||||
break; // reached end of file
|
||||
|
||||
String[] twoSites = line.split("\t");
|
||||
if (twoSites.length != 2)
|
||||
throw new UserException("Must have PAIRS of sites in line " + line + " of " + sitePairsFile);
|
||||
|
||||
SitePair sp = new SitePair(locParser.parseGenomeLoc(twoSites[0]), locParser.parseGenomeLoc(twoSites[1]));
|
||||
sitePairs.add(sp);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean generateExtendedEvents() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range.
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
boolean relevantSitePair = false;
|
||||
SitePair sp = null;
|
||||
if (prevSiteAndReads != null) {
|
||||
// all vc's below start at ref.getLocus() [due to requireStartHere = true]:
|
||||
sp = new SitePair(prevSiteAndReads.site, ref.getLocus());
|
||||
relevantSitePair = sitePairs.contains(sp);
|
||||
}
|
||||
|
||||
if (context == null || !context.hasBasePileup())
|
||||
return null;
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
String nextName = null;
|
||||
|
||||
//
|
||||
// TODO: ASSUMES THAT ALL READS COME FROM A SINGLE SAMPLE:
|
||||
// TODO: CODE SHOULD BE AS FOLLOWS:
|
||||
/*
|
||||
Collection<String> sampNames = pileup.getSampleNames();
|
||||
if (sampNames.size() != 1)
|
||||
throw new UserException("Reads must be for exactly one sample [not multi-sample]");
|
||||
nextName = sampNames.iterator().next();
|
||||
if (nextName == null)
|
||||
throw new UserException("Reads must be for exactly one sample");
|
||||
|
||||
if (sampleName == null)
|
||||
sampleName = nextName;
|
||||
else if (!nextName.equals(sampleName))
|
||||
throw new UserException("Reads must have a single consistent sample name");
|
||||
|
||||
pileup = pileup.getPileupForSampleName(sampleName);
|
||||
*/
|
||||
//
|
||||
|
||||
ReadBasesAtPosition readBases = new ReadBasesAtPosition();
|
||||
for (PileupElement p : pileup)
|
||||
readBases.putReadBase(p);
|
||||
|
||||
ReadCounter rdCounts = null;
|
||||
if (relevantSitePair) { // otherwise, processed the reads for their possible use in the future:
|
||||
PhasingReadList buildReads = new PhasingReadList(NUM_IN_PAIR);
|
||||
buildReads.updateBases(0, prevSiteAndReads.readBases);
|
||||
buildReads.updateBases(1, readBases);
|
||||
|
||||
List<PhasingRead> reads = new LinkedList<PhasingRead>();
|
||||
for (PhasingRead rd : buildReads) {
|
||||
if (rd.getNonNullIndices().length == NUM_IN_PAIR) // only want reads with BOTH bases called [possibly as deleted ("D")]
|
||||
reads.add(rd);
|
||||
}
|
||||
|
||||
// Count the occurence of each "haplotype":
|
||||
rdCounts = new ReadCounter();
|
||||
for (PhasingRead rd : reads)
|
||||
rdCounts.incrementCount(rd);
|
||||
}
|
||||
|
||||
|
||||
// Now, read the ROD and note the genotypes and their phase to be validated:
|
||||
Set<Haplotype> calledHaplotypes = null;
|
||||
List<Haplotype> allPossibleHaplotypes = null;
|
||||
|
||||
boolean requireStartHere = true; // only see each VariantContext once
|
||||
boolean takeFirstOnly = true; // take only the first entry from the ROD file
|
||||
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
|
||||
if (vc.isFiltered() || !vc.isSNP())
|
||||
continue;
|
||||
|
||||
if (vc.getNSamples() != 1)
|
||||
throw new UserException("ROD file must have exactly one sample [not multi-sample]");
|
||||
nextName = vc.getSampleNames().iterator().next();
|
||||
if (sampleName == null)
|
||||
sampleName = nextName;
|
||||
else if (!nextName.equals(sampleName))
|
||||
throw new UserException("ROD must have a single consistent sample name");
|
||||
|
||||
Genotype gt = vc.getGenotype(sampleName);
|
||||
|
||||
if (relevantSitePair) {
|
||||
Genotype prevGt = prevSiteAndReads.gt;
|
||||
List<Allele> prevAlleles = prevGt.getAlleles();
|
||||
List<Allele> curAlleles = gt.getAlleles();
|
||||
|
||||
calledHaplotypes = new TreeSet<Haplotype>(); // implemented Haplotype.compareTo()
|
||||
if (gt.isPhased()) {
|
||||
if (gt.getPloidy() != prevGt.getPloidy())
|
||||
throw new UserException("Invalid ROD file: cannot be phased AND have different ploidys!");
|
||||
|
||||
// Consider only the haplotypes called to be phased
|
||||
Iterator<Allele> curAllIt = curAlleles.iterator();
|
||||
for (Allele prevAll : prevAlleles) {
|
||||
Allele curAll = curAllIt.next();
|
||||
calledHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll));
|
||||
}
|
||||
}
|
||||
|
||||
// Consider EVERY combination of alleles as haplotypes [IF PHASED, this will give the contingency table in the CORRECT order]:
|
||||
allPossibleHaplotypes = new LinkedList<Haplotype>();
|
||||
for (Allele prevAll : prevAlleles) {
|
||||
for (Allele curAll : curAlleles) {
|
||||
allPossibleHaplotypes.add(successiveAllelesToHaplotype(prevAll, curAll));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prevSiteAndReads = new SiteGenotypeAndReads(ref.getLocus(), gt, readBases);
|
||||
}
|
||||
|
||||
int processedPairs = 0;
|
||||
if (relevantSitePair) {
|
||||
Map<Haplotype, Integer> haplotypeCounts = new TreeMap<Haplotype, Integer>(); // implemented Haplotype.compareTo()
|
||||
|
||||
processedPairs = 1;
|
||||
int totalCount = rdCounts.totalCount();
|
||||
System.out.println("\nPair: " + sp + " [# reads = " + totalCount + "]");
|
||||
|
||||
int matchCount = 0;
|
||||
for (Map.Entry<PhasingRead, Integer> rdEntry : rdCounts.entrySet()) {
|
||||
PhasingRead read = rdEntry.getKey();
|
||||
int count = rdEntry.getValue();
|
||||
|
||||
Haplotype readsHaplotype = new Haplotype(read);
|
||||
haplotypeCounts.put(readsHaplotype, count);
|
||||
|
||||
boolean readMatchesCalledHaplotype = calledHaplotypes != null && calledHaplotypes.contains(readsHaplotype);
|
||||
if (readMatchesCalledHaplotype)
|
||||
matchCount += count;
|
||||
|
||||
System.out.println("read" + ": " + read + (readMatchesCalledHaplotype ? "*" : "") + "\tcount: " + count);
|
||||
}
|
||||
|
||||
double percentMatchingReads = 100 * (matchCount / (double) totalCount);
|
||||
System.out.println("% MATCHING reads: " + percentMatchingReads + " [of " + totalCount + " TOTAL reads]");
|
||||
|
||||
out.print(sp);
|
||||
for (Haplotype hap : allPossibleHaplotypes) {
|
||||
Integer count = haplotypeCounts.get(hap);
|
||||
if (count == null) // haplotype may not have been observed in ANY reads
|
||||
count = 0;
|
||||
|
||||
out.print("\t" + count);
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
|
||||
return processedPairs;
|
||||
}
|
||||
|
||||
private Haplotype successiveAllelesToHaplotype(Allele prevAll, Allele curAll) {
|
||||
byte prevBase = SNPallelePair.getSingleBase(prevAll);
|
||||
byte curBase = SNPallelePair.getSingleBase(curAll);
|
||||
|
||||
byte[] hapBases = new byte[NUM_IN_PAIR];
|
||||
hapBases[0] = prevBase;
|
||||
hapBases[1] = curBase;
|
||||
return new Haplotype(hapBases);
|
||||
}
|
||||
|
||||
public Integer reduce(Integer addIn, Integer runningCount) {
|
||||
if (addIn == null)
|
||||
addIn = 0;
|
||||
|
||||
return runningCount + addIn;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param result the number of reads and VariantContexts seen.
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
System.out.println("Validated " + result + " pairs of sites.");
|
||||
}
|
||||
}
|
||||
|
||||
class SitePair implements Comparable<SitePair> {
|
||||
public GenomeLoc site1;
|
||||
public GenomeLoc site2;
|
||||
|
||||
public SitePair(GenomeLoc site1, GenomeLoc site2) {
|
||||
if (site1.size() > 1 || site2.size() > 1)
|
||||
throw new UserException("Must give pairs of SINGLE-LOCUS record start sites");
|
||||
|
||||
this.site1 = site1;
|
||||
this.site2 = site2;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return site1.toString() + "\t" + site2.toString();
|
||||
}
|
||||
|
||||
public int compareTo(SitePair other) {
|
||||
int comp1 = site1.compareTo(other.site1);
|
||||
if (comp1 != 0)
|
||||
return comp1;
|
||||
|
||||
return site2.compareTo(other.site2);
|
||||
}
|
||||
}
|
||||
|
||||
class SiteGenotypeAndReads {
|
||||
public GenomeLoc site;
|
||||
public Genotype gt;
|
||||
public ReadBasesAtPosition readBases;
|
||||
|
||||
public SiteGenotypeAndReads(GenomeLoc site, Genotype gt, ReadBasesAtPosition readBases) {
|
||||
this.site = site;
|
||||
this.gt = gt;
|
||||
this.readBases = readBases;
|
||||
}
|
||||
}
|
||||
|
||||
class PhasingReadList implements Iterable<PhasingRead> {
|
||||
private Map<String, PhasingRead> readsAtSites = null;
|
||||
private int numSites;
|
||||
|
||||
public PhasingReadList(int numSites) {
|
||||
this.readsAtSites = new HashMap<String, PhasingRead>();
|
||||
this.numSites = numSites;
|
||||
}
|
||||
|
||||
public void updateBases(int index, ReadBasesAtPosition readBases) {
|
||||
if (readBases == null)
|
||||
return;
|
||||
|
||||
for (ReadBase rb : readBases) {
|
||||
String readName = rb.readName;
|
||||
|
||||
PhasingRead rd = readsAtSites.get(readName);
|
||||
if (rd == null) {
|
||||
rd = new PhasingRead(numSites, rb.mappingQual);
|
||||
readsAtSites.put(readName, rd);
|
||||
}
|
||||
|
||||
// Arbitrarily updates to the last base observed for this sample and read (rb.base):
|
||||
rd.updateBaseAndQuality(index, rb.base, rb.baseQual);
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator<PhasingRead> iterator() {
|
||||
return readsAtSites.values().iterator();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return readsAtSites.size();
|
||||
}
|
||||
}
|
||||
|
||||
class ReadCounter {
|
||||
private Map<PhasingRead, Integer> counts;
|
||||
private int totalCount;
|
||||
|
||||
public ReadCounter() {
|
||||
this.counts = new TreeMap<PhasingRead, Integer>(); // implemented PhasingRead.compareTo()
|
||||
}
|
||||
|
||||
public void incrementCount(PhasingRead rd) {
|
||||
Integer cnt = counts.get(rd);
|
||||
if (cnt == null)
|
||||
cnt = 0;
|
||||
|
||||
counts.put(rd, cnt + 1);
|
||||
totalCount++;
|
||||
}
|
||||
|
||||
public Set<Map.Entry<PhasingRead, Integer>> entrySet() {
|
||||
return counts.entrySet();
|
||||
}
|
||||
|
||||
public int totalCount() {
|
||||
return totalCount;
|
||||
}
|
||||
}
|
||||
|
|
@ -93,6 +93,10 @@ public class BaseUtils {
|
|||
return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2);
|
||||
}
|
||||
|
||||
static public boolean extendedBasesAreEqual(byte base1, byte base2) {
|
||||
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Converts a IUPAC nucleotide code to a pair of bases
|
||||
|
|
|
|||
Loading…
Reference in New Issue