Implemented fully symmetric sliding window read-backed phaser

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4104 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-08-24 21:12:32 +00:00
parent cba5f05538
commit aa8cf25d08
1 changed file with 288 additions and 165 deletions

View File

@ -45,6 +45,8 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
/** /**
* Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads). * Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads).
@ -71,31 +73,37 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
@Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files", required = false) @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files", required = false)
protected String variantStatsFilePrefix = null; protected String variantStatsFilePrefix = null;
private PhasingQualityStatsWriter statsWriter = null; private LinkedList<VariantAndReads> unphasedSiteQueue = null;
private LinkedList<VariantAndReads> phasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions
private LinkedList<VariantAndReads> siteQueue = null;
private VariantAndReads prevVr = null; // the VC emitted after phasing, and the alignment bases at the position emitted
private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0); private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0);
private static boolean DEBUG_DETAILED = true; private static boolean DEBUG_DETAILED = true;
private void initializeVcfWriter(VariantContext vc) { private LinkedList<String> rodNames = null;
private PhasingQualityStatsWriter statsWriter = null;
/**
 * Walker lifecycle hook: prepare all per-run state before the first map() call.
 * Sets up the "variant" ROD track name list, the unphased-site queue and the
 * phased-site buffer, writes the VCF header, and — only when a stats file
 * prefix was supplied — opens the per-sample phasing-quality stats writer.
 */
public void initialize() {
    rodNames = new LinkedList<String>(Collections.singletonList("variant"));

    unphasedSiteQueue = new LinkedList<VariantAndReads>();
    phasedSites = new LinkedList<VariantAndReads>();

    initializeVcfWriter();

    if (variantStatsFilePrefix != null) {
        statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix);
    }
}
private void initializeVcfWriter() {
// setup the header fields // setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
hInfo.add(new VCFFormatHeaderLine("PQ", 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); hInfo.add(new VCFFormatHeaderLine("PQ", 1, VCFHeaderLineType.Float, "Read-backed phasing quality"));
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(vc.getSampleNames()))); Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
} writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
public void initialize() {
siteQueue = new LinkedList<VariantAndReads>();
prevVr = new VariantAndReads(null, null, true);
if (variantStatsFilePrefix != null)
statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix);
} }
public boolean generateExtendedEvents() { public boolean generateExtendedEvents() {
@ -107,7 +115,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
/** /**
* For each site of interest, cache the current site and then use the cache to phase all upstream sites * For each site of interest, cache the current site and then use the cache to phase all sites
* for which "sufficient" information has already been observed. * for which "sufficient" information has already been observed.
* *
* @param tracker the meta-data tracker * @param tracker the meta-data tracker
@ -121,8 +129,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingStats phaseStats = new PhasingStats(); PhasingStats phaseStats = new PhasingStats();
LinkedList<String> rodNames = new LinkedList<String>();
rodNames.add("variant");
boolean requireStartHere = true; // only see each VariantContext once boolean requireStartHere = true; // only see each VariantContext once
boolean takeFirstOnly = false; // take as many entries as the VCF file has boolean takeFirstOnly = false; // take as many entries as the VCF file has
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
@ -131,7 +137,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
processVariant = false; processVariant = false;
VariantAndReads vr = new VariantAndReads(vc, context, processVariant); VariantAndReads vr = new VariantAndReads(vc, context, processVariant);
siteQueue.add(vr); unphasedSiteQueue.add(vr);
int numReads = 0; int numReads = 0;
if (context.hasBasePileup()) { if (context.hasBasePileup()) {
@ -143,50 +149,101 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); PhasingStats addInPhaseStats = new PhasingStats(numReads, 1);
phaseStats.addIn(addInPhaseStats); phaseStats.addIn(addInPhaseStats);
} }
List<VariantContext> phasedList = processQueue(ref.getLocus(), phaseStats); List<VariantContext> phasedList = processQueue(phaseStats, false);
return new PhasingStatsAndOutput(phaseStats, phasedList); return new PhasingStatsAndOutput(phaseStats, phasedList);
} }
private List<VariantContext> processQueue(GenomeLoc loc, PhasingStats phaseStats) { private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
List<VariantContext> vcList = new LinkedList<VariantContext>(); GenomeLoc lastLocus = null;
if (!processAll && !unphasedSiteQueue.isEmpty())
lastLocus = VariantContextUtils.getLocation(unphasedSiteQueue.peekLast().variant);
while (!siteQueue.isEmpty()) { List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
if (loc != null) { while (!unphasedSiteQueue.isEmpty()) {
VariantContext vc = siteQueue.peek().variant; if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
if (isInWindowRange(loc, VariantContextUtils.getLocation(vc))) { VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
// loc is still not far enough ahead of vc (since we ASSUME that the VCF is ordered by <contig,locus>) if (isInWindowRange(lastLocus, VariantContextUtils.getLocation(nextToPhaseVc))) {
/* lastLocus is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc
(note that we ASSUME that the VCF is ordered by <contig,locus>) */
break; break;
} }
// Already saw all variant positions within cacheWindow distance ahead of vc (on its contig) // Already saw all variant positions within cacheWindow distance ahead of vc (on its contig)
} }
VariantContext phasedVc = finalizePhasingAndRemove(phaseStats);
vcList.add(phasedVc); // Update phasedSites before it's used in finalizePhasing:
oldPhasedList.addAll(discardIrrelevantPhasedSites());
VariantAndReads phasedVr = finalizePhasing(unphasedSiteQueue.remove(), phaseStats);
phasedSites.add(phasedVr);
} }
// Update phasedSites after finalizePhasing is done:
oldPhasedList.addAll(discardIrrelevantPhasedSites());
return oldPhasedList;
}
private List<VariantContext> discardIrrelevantPhasedSites() {
List<VariantContext> vcList = new LinkedList<VariantContext>();
VariantContext nextToPhaseVc = null;
if (!unphasedSiteQueue.isEmpty())
nextToPhaseVc = unphasedSiteQueue.peek().variant;
while (!phasedSites.isEmpty()) {
VariantAndReads phasedVr = phasedSites.peek();
VariantContext phasedVc = phasedVr.variant;
if (nextToPhaseVc != null && phasedVr.processVariant &&
(isInWindowRange(phasedVc, nextToPhaseVc) // nextToPhaseVc is still not far enough ahead of phasedVc to exclude phasedVc from calculations
// since ALWAYS want the previous site to be included for phasing nextToPhaseVc:
|| (phasedSites.size() == 1 && VariantContextUtils.getLocation(phasedVc).onSameContig(VariantContextUtils.getLocation(nextToPhaseVc))))) {
break;
}
vcList.add(phasedSites.remove().variant);
}
return vcList; return vcList;
} }
/* Finalize phasing of vc (head of siteQueue) using all VariantContext objects in the siteQueue that are /* Finalize phasing of vc (head of unphasedSiteQueue) using all VariantContext objects in
within cacheWindow distance ahead of vc (on its contig). phasedSites and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig).
ASSUMES:
1. siteQueue is NOT empty. ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue).
2. All VariantContexts in siteQueue are in positions downstream of vc (head of queue).
*/ */
private VariantAndReads finalizePhasing(VariantAndReads vr, PhasingStats phaseStats) {
private VariantContext finalizePhasingAndRemove(PhasingStats phaseStats) {
VariantAndReads vr = siteQueue.remove(); // remove vr from head of queue
VariantContext vc = vr.variant;
if (!vr.processVariant) if (!vr.processVariant)
return vc; // return vc as is return vr; // return vr as is
boolean hasPreviousSite = previousIsRelevantTo(vc); VariantContext vc = vr.variant;
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc)); logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc));
LinkedList<VariantAndReads> windowVaList = new LinkedList<VariantAndReads>(); // Find the previous VariantContext (that was processed and phased):
VariantAndReads prevVr = null;
Iterator<VariantAndReads> backwardsIt = phasedSites.descendingIterator();
while (backwardsIt.hasNext()) {
VariantAndReads backVr = backwardsIt.next();
if (backVr.processVariant) {
prevVr = backVr;
break;
}
}
boolean hasPreviousSite = (prevVr != null);
LinkedList<VariantAndReads> windowVaList = null;
if (hasPreviousSite) { if (hasPreviousSite) {
windowVaList.add(prevVr); // need to add one position for phasing context windowVaList = new LinkedList<VariantAndReads>();
windowVaList.add(vr); // add position to be phased
for (VariantAndReads nextVr : siteQueue) { // Include previously phased sites in the phasing computation:
for (VariantAndReads phasedVr : phasedSites) {
if (phasedVr.processVariant)
windowVaList.add(phasedVr);
}
// Add position to be phased:
windowVaList.add(vr);
// Include as of yet unphased sites in the phasing computation:
for (VariantAndReads nextVr : unphasedSiteQueue) {
if (!isInWindowRange(vc, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc if (!isInWindowRange(vc, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc
break; break;
if (nextVr.processVariant) // include in the phasing computation if (nextVr.processVariant) // include in the phasing computation
@ -201,7 +258,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
logger.debug(""); logger.debug("");
Map<String, Genotype> sampGenotypes = vc.getGenotypes(); Map<String, Genotype> sampGenotypes = vc.getGenotypes();
VariantContext prevVc = prevVr.variant;
Map<String, Genotype> phasedGtMap = new TreeMap<String, Genotype>(); Map<String, Genotype> phasedGtMap = new TreeMap<String, Genotype>();
// Perform per-sample phasing: // Perform per-sample phasing:
@ -215,115 +271,154 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
Biallele biall = new Biallele(gt); Biallele biall = new Biallele(gt);
HashMap<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes()); HashMap<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
if (hasPreviousSite && gt.isHet() && prevVc.getGenotype(samp).isHet()) { //otherwise, can trivially phase if (hasPreviousSite && gt.isHet()) {
logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: "); VariantContext prevVc = prevVr.variant;
logger.debug("\n" + biall); Genotype prevGenotype = prevVc.getGenotype(samp);
if (prevGenotype.isHet()) { //otherwise, can trivially phase
logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: ");
logger.debug("\n" + biall);
List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>(); List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>();
for (VariantAndReads phaseInfoVr : windowVaList) { int phasingSiteIndex = -1;
VariantContext phaseInfoVc = phaseInfoVr.variant; int currentIndex = 0;
Genotype phaseInfoGt = phaseInfoVc.getGenotype(samp); for (VariantAndReads phaseInfoVr : windowVaList) {
if (phaseInfoGt.isHet()) { // otherwise, of no value to phasing VariantContext phaseInfoVc = phaseInfoVr.variant;
sampleWindowVaList.add(phaseInfoVr); Genotype phaseInfoGt = phaseInfoVc.getGenotype(samp);
logger.debug("STARTING TO PHASE USING POS = " + VariantContextUtils.getLocation(phaseInfoVc)); if (phaseInfoGt.isHet()) { // otherwise, of no value to phasing
sampleWindowVaList.add(phaseInfoVr);
if (phasingSiteIndex == -1) {
if (phaseInfoVr == vr)
phasingSiteIndex = currentIndex;
else
currentIndex++;
}
logger.debug("STARTING TO PHASE USING POS = " + VariantContextUtils.getLocation(phaseInfoVc));
}
} }
} if (logger.isDebugEnabled() && (phasingSiteIndex == -1 || phasingSiteIndex == 0))
if (sampleWindowVaList.size() > maxPhaseSites) { throw new StingException("Internal error: could NOT find vr and/or prevVr!");
logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase -- REDUCING to first " + maxPhaseSites + " sites!");
sampleWindowVaList = sampleWindowVaList.subList(0, maxPhaseSites);
}
/* Will map a phase and its "complement" to a single representative phase, if (sampleWindowVaList.size() > maxPhaseSites) {
and marginalizeTable() marginalizes to the first 2 positions [i.e., the previous position and the current position]: logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase -- REDUCING to " + maxPhaseSites + " sites:\n" + toString(sampleWindowVaList));
*/
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(sampleWindowVaList, samp, 2);
PhasingTable sampleHaps = tabCreator.getNewTable();
// Assemble the "sub-reads" from the heterozygous positions for this sample: int prevSiteIndex = phasingSiteIndex - 1;
LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>(); int numToUse = maxPhaseSites - 2; // since always keep prevVr and vr
for (VariantAndReads phaseInfoVr : sampleWindowVaList) { int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue();
ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(samp);
allPositions.add(readBases);
}
HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions);
logger.debug("Number of reads at sites: " + allReads.size());
int numUsedReads = 0;
// Update the phasing table based on each of the sub-reads for this sample: int numOnLeft = prevSiteIndex;
for (Map.Entry<String, Read> nameToReads : allReads.entrySet()) { int numOnRight = sampleWindowVaList.size() - (phasingSiteIndex + 1);
Read rd = nameToReads.getValue();
if (rd.numNonNulls() <= 1) // can't possibly provide any phasing information, so save time
continue;
numUsedReads++; int useOnLeft, useOnRight;
if (DEBUG_DETAILED) if (numOnLeft <= numOnRight) {
logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : "")); useOnLeft = Math.min(halfToUse, numOnLeft);
useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) { }
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); else { // numOnRight < numOnLeft
pte.getScore().integrateReadScore(score); useOnRight = Math.min(halfToUse, numOnRight);
useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
if (DEBUG_DETAILED) }
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); int startIndex = prevSiteIndex - useOnLeft;
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
phasingSiteIndex -= startIndex;
sampleWindowVaList = sampleWindowVaList.subList(startIndex, stopIndex);
logger.warn("REDUCED to " + sampleWindowVaList.size() + " sites:\n" + toString(sampleWindowVaList));
} }
PhaseResult pr = phaseSample(samp, sampleWindowVaList, phasingSiteIndex);
genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh);
if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevGenotype);
logger.debug("CHOSEN PHASE FOR PREVIOUS:\n" + prevBiall + "\n");
logger.debug("CHOSE PHASE:\n" + biall + "\n\n");
ensurePhasing(biall, prevBiall, pr.haplotype);
gtAttribs.put("PQ", pr.phaseQuality);
}
if (statsWriter != null)
statsWriter.addStat(samp, VariantContextUtils.getLocation(vc), distance(prevVc, vc), pr.phaseQuality);
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
if (genotypesArePhased)
sampPhaseCounts.numPhased++;
} }
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
logger.debug("numUsedReads = " + numUsedReads);
// Marginalize each haplotype to its first 2 positions:
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
// Determine the phase at this position:
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double posteriorProb = maxEntry.getScore().getValue();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
}
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
if (statsWriter != null)
statsWriter.addStat(samp, distance(prevVc, vc), phaseQuality);
genotypesArePhased = (phaseQuality >= phaseQualityThresh);
if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevVc.getGenotype(samp));
ensurePhasing(biall, prevBiall, maxEntry.getHaplotypeClass().getRepresentative());
gtAttribs.put("PQ", phaseQuality);
logger.debug("CHOSE PHASE:\n" + biall + "\n\n");
}
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
sampPhaseCounts.numPhased += (genotypesArePhased ? 1 : 0);
} }
List<Allele> phasedAll = biall.getAllelesAsList(); List<Allele> phasedAll = biall.getAllelesAsList();
Genotype phasedGt = new Genotype(gt.getSampleName(), phasedAll, gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); Genotype phasedGt = new Genotype(gt.getSampleName(), phasedAll, gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
phasedGtMap.put(samp, phasedGt); phasedGtMap.put(samp, phasedGt);
} }
VariantContext phasedVc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes());
prevVr.variant = phasedVc;
prevVr.sampleReadBases = vr.sampleReadBases;
phaseStats.addIn(new PhasingStats(samplePhaseStats)); phaseStats.addIn(new PhasingStats(samplePhaseStats));
return phasedVc; VariantContext phasedVc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes());
return new VariantAndReads(phasedVc, vr.sampleReadBases, vr.processVariant);
}
private PhaseResult phaseSample(String sample, List<VariantAndReads> variantList, int phasingSiteIndex) {
/* Will map a phase and its "complement" to a single representative phase,
and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
*/
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(variantList, sample, phasingSiteIndex - 1, 2);
PhasingTable sampleHaps = tabCreator.getNewTable();
// Assemble the "sub-reads" from the heterozygous positions for this sample:
LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>();
for (VariantAndReads phaseInfoVr : variantList) {
ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(sample);
allPositions.add(readBases);
}
HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions);
logger.debug("Number of reads at sites: " + allReads.size());
int numUsedReads = 0;
// Update the phasing table based on each of the sub-reads for this sample:
for (Map.Entry<String, Read> nameToReads : allReads.entrySet()) {
Read rd = nameToReads.getValue();
if (rd.numNonNulls() <= 1) // can't possibly provide any phasing information, so save time
continue;
numUsedReads++;
if (DEBUG_DETAILED)
logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : ""));
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
pte.getScore().integrateReadScore(score);
if (DEBUG_DETAILED)
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
}
}
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
logger.debug("numUsedReads = " + numUsedReads);
// Marginalize each haplotype to its first 2 positions:
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
// Determine the phase at this position:
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double posteriorProb = maxEntry.getScore().getValue();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
}
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
return new PhaseResult(maxEntry.getHaplotypeClass().getRepresentative(), phaseQuality);
} }
/* /*
@ -343,11 +438,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
curBiall.swapAlleles(); curBiall.swapAlleles();
} }
private boolean previousIsRelevantTo(VariantContext vc) {
VariantContext prevVc = prevVr.variant;
return (prevVc != null && VariantContextUtils.getLocation(prevVc).onSameContig(VariantContextUtils.getLocation(vc)));
}
private boolean isInWindowRange(VariantContext vc1, VariantContext vc2) { private boolean isInWindowRange(VariantContext vc1, VariantContext vc2) {
GenomeLoc loc1 = VariantContextUtils.getLocation(vc1); GenomeLoc loc1 = VariantContextUtils.getLocation(vc1);
GenomeLoc loc2 = VariantContextUtils.getLocation(vc2); GenomeLoc loc2 = VariantContextUtils.getLocation(vc2);
@ -369,9 +459,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
private void writeVCF(VariantContext vc) { private void writeVCF(VariantContext vc) {
if (writer == null)
initializeVcfWriter(vc);
byte refBase; byte refBase;
if (!vc.isIndel()) { if (!vc.isIndel()) {
Allele varAllele = vc.getReference(); Allele varAllele = vc.getReference();
@ -393,12 +480,12 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
/** /**
* Phase anything left in the cached siteQueue, and report the number of reads and VariantContexts processed. * Phase anything left in the cached unphasedSiteQueue, and report the number of reads and VariantContexts processed.
* *
* @param result the number of reads and VariantContexts seen. * @param result the number of reads and VariantContexts seen.
*/ */
public void onTraversalDone(PhasingStats result) { public void onTraversalDone(PhasingStats result) {
List<VariantContext> finalList = processQueue(null, result); List<VariantContext> finalList = processQueue(result, true);
writeVarContList(finalList); writeVarContList(finalList);
if (statsWriter != null) if (statsWriter != null)
statsWriter.close(); statsWriter.close();
@ -532,6 +619,12 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
public HashMap<String, ReadBasesAtPosition> sampleReadBases; public HashMap<String, ReadBasesAtPosition> sampleReadBases;
public boolean processVariant; public boolean processVariant;
/**
 * Builds a VariantAndReads directly from already-extracted per-sample read
 * bases, bypassing the AlignmentContext-based constructor. Used to re-wrap a
 * site after phasing while preserving its original pileup bases.
 */
public VariantAndReads(VariantContext variant, HashMap<String, ReadBasesAtPosition> sampleReadBases, boolean processVariant) {
    this.processVariant = processVariant;
    this.sampleReadBases = sampleReadBases;
    this.variant = variant;
}
public VariantAndReads(VariantContext variant, AlignmentContext alignment, boolean processVariant) { public VariantAndReads(VariantContext variant, AlignmentContext alignment, boolean processVariant) {
this.variant = variant; this.variant = variant;
this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>(); this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>();
@ -560,6 +653,20 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
} }
/**
 * Renders a list of sites as their genomic locations joined by " -- "
 * (e.g. for the maxPhaseSites reduction warning messages).
 *
 * @param vrList sites to render; an empty list yields an empty string
 * @return the locations of all entries, separated by " -- "
 */
private static String toString(List<VariantAndReads> vrList) {
    StringBuilder sb = new StringBuilder();
    String separator = "";
    for (VariantAndReads vr : vrList) {
        sb.append(separator).append(VariantContextUtils.getLocation(vr.variant));
        separator = " -- ";
    }
    return sb.toString();
}
private static class ReadBase { private static class ReadBase {
public String readName; public String readName;
public byte base; public byte base;
@ -599,19 +706,14 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
// //
private static abstract class HaplotypeTableCreator { private static abstract class HaplotypeTableCreator {
protected Genotype[] genotypes; protected Genotype[] genotypes;
protected Biallele[] bialleles;
public HaplotypeTableCreator(List<VariantAndReads> vaList, String sample) { public HaplotypeTableCreator(List<VariantAndReads> vaList, String sample) {
this.genotypes = new Genotype[vaList.size()]; this.genotypes = new Genotype[vaList.size()];
this.bialleles = new Biallele[vaList.size()];
int index = 0; int index = 0;
for (VariantAndReads phaseInfoVr : vaList) { for (VariantAndReads phaseInfoVr : vaList) {
VariantContext phaseInfoVc = phaseInfoVr.variant; VariantContext phaseInfoVc = phaseInfoVr.variant;
Genotype phaseInfoGt = phaseInfoVc.getGenotype(sample); Genotype phaseInfoGt = phaseInfoVc.getGenotype(sample);
genotypes[index] = phaseInfoGt; genotypes[index++] = phaseInfoGt;
bialleles[index] = new Biallele(phaseInfoGt);
index++;
} }
} }
@ -643,7 +745,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
Haplotype rep = pte.getHaplotypeClass().getRepresentative(); Haplotype rep = pte.getHaplotypeClass().getRepresentative();
PreciseNonNegativeDouble score = hapMap.get(rep); PreciseNonNegativeDouble score = hapMap.get(rep);
if (score == null) { if (score == null) {
score = new PreciseNonNegativeDouble(0.0); score = new PreciseNonNegativeDouble(ZERO);
hapMap.put(rep, score); hapMap.put(rep, score);
} }
score.plusEqual(pte.getScore()); score.plusEqual(pte.getScore());
@ -663,10 +765,18 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator { private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator {
private Biallele[] bialleles;
private int startIndex;
private int marginalizeLength; private int marginalizeLength;
public BiallelicComplementHaplotypeTableCreator(List<VariantAndReads> vaList, String sample, int marginalizeLength) { public BiallelicComplementHaplotypeTableCreator(List<VariantAndReads> vaList, String sample, int startIndex, int marginalizeLength) {
super(vaList, sample); super(vaList, sample);
this.bialleles = new Biallele[genotypes.length];
for (int i = 0; i < genotypes.length; i++)
bialleles[i] = new Biallele(genotypes[i]);
this.startIndex = startIndex;
this.marginalizeLength = marginalizeLength; this.marginalizeLength = marginalizeLength;
} }
@ -675,15 +785,18 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingTable table = new PhasingTable(); PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) { for (Haplotype hap : getAllHaplotypes()) {
if (bialleles[0].matchesTopBase(hap.getBase(0))) { if (bialleles[startIndex].matchesTopBase(hap.getBase(startIndex))) {
/* hap is the "representative" haplotype [arbitrarily defined to be /* hap is the "representative" haplotype [DEFINED here to be
the one with the top base at the 0th position] the one with the top base at the startIndex position.
NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!]
*/ */
ArrayList<Haplotype> hapList = new ArrayList<Haplotype>(); ArrayList<Haplotype> hapList = new ArrayList<Haplotype>();
hapList.add(hap); hapList.add(hap);
hapList.add(complement(hap)); hapList.add(complement(hap));
Haplotype rep = hap.subHaplotype(0, Math.min(marginalizeLength, hap.size())); // only want first marginalizeLength positions // want marginalizeLength positions starting at startIndex:
Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength);
HaplotypeClass hapClass = new HaplotypeClass(hapList, rep); HaplotypeClass hapClass = new HaplotypeClass(hapList, rep);
table.addEntry(hapClass, hapClassPrior); table.addEntry(hapClass, hapClassPrior);
} }
@ -787,6 +900,16 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
} }
} }
/**
 * Value holder for the outcome of phasing one sample at one site:
 * the representative haplotype of the max-scoring haplotype class and
 * its PHRED-scaled phase quality (-10 * log10 of the summed error
 * probabilities; deliberately NOT capped, unlike QualityUtils.probToQual).
 */
private static class PhaseResult {
public Haplotype haplotype;   // representative haplotype chosen for the phase
public double phaseQuality;   // PHRED-scaled confidence in that phase
public PhaseResult(Haplotype haplotype, double phaseQuality) {
this.haplotype = haplotype;
this.phaseQuality = phaseQuality;
}
}
} }
@ -903,7 +1026,7 @@ class Haplotype extends BaseArray implements Cloneable {
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive. // Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
public Haplotype subHaplotype(int fromIndex, int toIndex) { public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, toIndex)); return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
} }
} }
@ -1114,10 +1237,10 @@ class PhasingQualityStatsWriter {
this.variantStatsFilePrefix = variantStatsFilePrefix; this.variantStatsFilePrefix = variantStatsFilePrefix;
} }
public void addStat(String sample, int distanceFromPrevious, double phasingQuality) { public void addStat(String sample, GenomeLoc locus, int distanceFromPrevious, double phasingQuality) {
BufferedWriter sampWriter = sampleToStatsWriter.get(sample); BufferedWriter sampWriter = sampleToStatsWriter.get(sample);
if (sampWriter == null) { if (sampWriter == null) {
String fileName = variantStatsFilePrefix + "." + sample + ".distance_PQ.txt"; String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ.txt";
FileOutputStream output; FileOutputStream output;
try { try {
@ -1129,7 +1252,7 @@ class PhasingQualityStatsWriter {
sampleToStatsWriter.put(sample, sampWriter); sampleToStatsWriter.put(sample, sampWriter);
} }
try { try {
sampWriter.write(distanceFromPrevious + "\t" + phasingQuality + "\n"); sampWriter.write(locus + "\t" + distanceFromPrevious + "\t" + phasingQuality + "\n");
sampWriter.flush(); sampWriter.flush();
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e); throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e);