Implemented fully symmetric sliding window read-backed phaser

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4104 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-08-24 21:12:32 +00:00
parent cba5f05538
commit aa8cf25d08
1 changed file with 288 additions and 165 deletions

View File

@ -45,6 +45,8 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.*;
import java.util.*;
import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads).
@ -71,31 +73,37 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
@Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files", required = false)
protected String variantStatsFilePrefix = null;
private PhasingQualityStatsWriter statsWriter = null;
private LinkedList<VariantAndReads> siteQueue = null;
private VariantAndReads prevVr = null; // the VC emitted after phasing, and the alignment bases at the position emitted
private LinkedList<VariantAndReads> unphasedSiteQueue = null;
private LinkedList<VariantAndReads> phasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions
private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0);
private static boolean DEBUG_DETAILED = true;
private void initializeVcfWriter(VariantContext vc) {
private LinkedList<String> rodNames = null;
private PhasingQualityStatsWriter statsWriter = null;
public void initialize() {
rodNames = new LinkedList<String>();
rodNames.add("variant");
unphasedSiteQueue = new LinkedList<VariantAndReads>();
phasedSites = new LinkedList<VariantAndReads>();
initializeVcfWriter();
if (variantStatsFilePrefix != null)
statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix);
}
private void initializeVcfWriter() {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
hInfo.add(new VCFFormatHeaderLine("PQ", 1, VCFHeaderLineType.Float, "Read-backed phasing quality"));
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(vc.getSampleNames())));
}
public void initialize() {
siteQueue = new LinkedList<VariantAndReads>();
prevVr = new VariantAndReads(null, null, true);
if (variantStatsFilePrefix != null)
statsWriter = new PhasingQualityStatsWriter(variantStatsFilePrefix);
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
}
public boolean generateExtendedEvents() {
@ -107,7 +115,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
/**
* For each site of interest, cache the current site and then use the cache to phase all upstream sites
* For each site of interest, cache the current site and then use the cache to phase all sites
* for which "sufficient" information has already been observed.
*
* @param tracker the meta-data tracker
@ -121,8 +129,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingStats phaseStats = new PhasingStats();
LinkedList<String> rodNames = new LinkedList<String>();
rodNames.add("variant");
boolean requireStartHere = true; // only see each VariantContext once
boolean takeFirstOnly = false; // take as many entries as the VCF file has
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
@ -131,7 +137,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
processVariant = false;
VariantAndReads vr = new VariantAndReads(vc, context, processVariant);
siteQueue.add(vr);
unphasedSiteQueue.add(vr);
int numReads = 0;
if (context.hasBasePileup()) {
@ -143,50 +149,101 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingStats addInPhaseStats = new PhasingStats(numReads, 1);
phaseStats.addIn(addInPhaseStats);
}
List<VariantContext> phasedList = processQueue(ref.getLocus(), phaseStats);
List<VariantContext> phasedList = processQueue(phaseStats, false);
return new PhasingStatsAndOutput(phaseStats, phasedList);
}
private List<VariantContext> processQueue(GenomeLoc loc, PhasingStats phaseStats) {
List<VariantContext> vcList = new LinkedList<VariantContext>();
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
GenomeLoc lastLocus = null;
if (!processAll && !unphasedSiteQueue.isEmpty())
lastLocus = VariantContextUtils.getLocation(unphasedSiteQueue.peekLast().variant);
while (!siteQueue.isEmpty()) {
if (loc != null) {
VariantContext vc = siteQueue.peek().variant;
if (isInWindowRange(loc, VariantContextUtils.getLocation(vc))) {
// loc is still not far enough ahead of vc (since we ASSUME that the VCF is ordered by <contig,locus>)
List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
while (!unphasedSiteQueue.isEmpty()) {
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
if (isInWindowRange(lastLocus, VariantContextUtils.getLocation(nextToPhaseVc))) {
/* lastLocus is still not far enough ahead of nextToPhaseVc to have all phasing information for nextToPhaseVc
(note that we ASSUME that the VCF is ordered by <contig,locus>) */
break;
}
// Already saw all variant positions within cacheWindow distance ahead of vc (on its contig)
}
VariantContext phasedVc = finalizePhasingAndRemove(phaseStats);
vcList.add(phasedVc);
// Update phasedSites before it's used in finalizePhasing:
oldPhasedList.addAll(discardIrrelevantPhasedSites());
VariantAndReads phasedVr = finalizePhasing(unphasedSiteQueue.remove(), phaseStats);
phasedSites.add(phasedVr);
}
// Update phasedSites after finalizePhasing is done:
oldPhasedList.addAll(discardIrrelevantPhasedSites());
return oldPhasedList;
}
/**
 * Drains already-phased sites from the head of phasedSites that can no longer influence
 * the phasing of any upcoming variant, returning their VariantContexts for output.
 *
 * A phased site is retained (loop breaks) while the next variant still to be phased
 * (head of unphasedSiteQueue) is within window range of it, or when it is the single
 * remaining phased site on the same contig (the immediately previous site is always
 * kept as phasing context for the next variant).
 *
 * @return the VariantContexts removed from phasedSites, in emission order (may be empty)
 */
private List<VariantContext> discardIrrelevantPhasedSites() {
List<VariantContext> vcList = new LinkedList<VariantContext>();
// The next variant awaiting phasing determines which phased sites are still "relevant":
VariantContext nextToPhaseVc = null;
if (!unphasedSiteQueue.isEmpty())
nextToPhaseVc = unphasedSiteQueue.peek().variant;
while (!phasedSites.isEmpty()) {
VariantAndReads phasedVr = phasedSites.peek();
VariantContext phasedVc = phasedVr.variant;
// Stop discarding once the head phased site is still needed as context:
if (nextToPhaseVc != null && phasedVr.processVariant &&
(isInWindowRange(phasedVc, nextToPhaseVc) // nextToPhaseVc is still not far enough ahead of phasedVc to exclude phasedVc from calculations
// since ALWAYS want the previous site to be included for phasing nextToPhaseVc:
|| (phasedSites.size() == 1 && VariantContextUtils.getLocation(phasedVc).onSameContig(VariantContextUtils.getLocation(nextToPhaseVc))))) {
break;
}
// No longer relevant to any future phasing decision -- emit it:
vcList.add(phasedSites.remove().variant);
}
return vcList;
}
/* Finalize phasing of vc (head of siteQueue) using all VariantContext objects in the siteQueue that are
within cacheWindow distance ahead of vc (on its contig).
ASSUMES:
1. siteQueue is NOT empty.
2. All VariantContexts in siteQueue are in positions downstream of vc (head of queue).
/* Finalize phasing of vc (head of unphasedSiteQueue) using all VariantContext objects in
phasedSites and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig).
ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue).
*/
private VariantContext finalizePhasingAndRemove(PhasingStats phaseStats) {
VariantAndReads vr = siteQueue.remove(); // remove vr from head of queue
VariantContext vc = vr.variant;
private VariantAndReads finalizePhasing(VariantAndReads vr, PhasingStats phaseStats) {
if (!vr.processVariant)
return vc; // return vc as is
return vr; // return vr as is
boolean hasPreviousSite = previousIsRelevantTo(vc);
VariantContext vc = vr.variant;
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc));
LinkedList<VariantAndReads> windowVaList = new LinkedList<VariantAndReads>();
// Find the previous VariantContext (that was processed and phased):
VariantAndReads prevVr = null;
Iterator<VariantAndReads> backwardsIt = phasedSites.descendingIterator();
while (backwardsIt.hasNext()) {
VariantAndReads backVr = backwardsIt.next();
if (backVr.processVariant) {
prevVr = backVr;
break;
}
}
boolean hasPreviousSite = (prevVr != null);
LinkedList<VariantAndReads> windowVaList = null;
if (hasPreviousSite) {
windowVaList.add(prevVr); // need to add one position for phasing context
windowVaList.add(vr); // add position to be phased
for (VariantAndReads nextVr : siteQueue) {
windowVaList = new LinkedList<VariantAndReads>();
// Include previously phased sites in the phasing computation:
for (VariantAndReads phasedVr : phasedSites) {
if (phasedVr.processVariant)
windowVaList.add(phasedVr);
}
// Add position to be phased:
windowVaList.add(vr);
// Include as of yet unphased sites in the phasing computation:
for (VariantAndReads nextVr : unphasedSiteQueue) {
if (!isInWindowRange(vc, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc
break;
if (nextVr.processVariant) // include in the phasing computation
@ -201,7 +258,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
logger.debug("");
Map<String, Genotype> sampGenotypes = vc.getGenotypes();
VariantContext prevVc = prevVr.variant;
Map<String, Genotype> phasedGtMap = new TreeMap<String, Genotype>();
// Perform per-sample phasing:
@ -215,115 +271,154 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
Biallele biall = new Biallele(gt);
HashMap<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
if (hasPreviousSite && gt.isHet() && prevVc.getGenotype(samp).isHet()) { //otherwise, can trivially phase
logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: ");
logger.debug("\n" + biall);
if (hasPreviousSite && gt.isHet()) {
VariantContext prevVc = prevVr.variant;
Genotype prevGenotype = prevVc.getGenotype(samp);
if (prevGenotype.isHet()) { //otherwise, can trivially phase
logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: ");
logger.debug("\n" + biall);
List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>();
for (VariantAndReads phaseInfoVr : windowVaList) {
VariantContext phaseInfoVc = phaseInfoVr.variant;
Genotype phaseInfoGt = phaseInfoVc.getGenotype(samp);
if (phaseInfoGt.isHet()) { // otherwise, of no value to phasing
sampleWindowVaList.add(phaseInfoVr);
logger.debug("STARTING TO PHASE USING POS = " + VariantContextUtils.getLocation(phaseInfoVc));
List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>();
int phasingSiteIndex = -1;
int currentIndex = 0;
for (VariantAndReads phaseInfoVr : windowVaList) {
VariantContext phaseInfoVc = phaseInfoVr.variant;
Genotype phaseInfoGt = phaseInfoVc.getGenotype(samp);
if (phaseInfoGt.isHet()) { // otherwise, of no value to phasing
sampleWindowVaList.add(phaseInfoVr);
if (phasingSiteIndex == -1) {
if (phaseInfoVr == vr)
phasingSiteIndex = currentIndex;
else
currentIndex++;
}
logger.debug("STARTING TO PHASE USING POS = " + VariantContextUtils.getLocation(phaseInfoVc));
}
}
}
if (sampleWindowVaList.size() > maxPhaseSites) {
logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase -- REDUCING to first " + maxPhaseSites + " sites!");
sampleWindowVaList = sampleWindowVaList.subList(0, maxPhaseSites);
}
if (logger.isDebugEnabled() && (phasingSiteIndex == -1 || phasingSiteIndex == 0))
throw new StingException("Internal error: could NOT find vr and/or prevVr!");
/* Will map a phase and its "complement" to a single representative phase,
and marginalizeTable() marginalizes to the first 2 positions [i.e., the previous position and the current position]:
*/
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(sampleWindowVaList, samp, 2);
PhasingTable sampleHaps = tabCreator.getNewTable();
if (sampleWindowVaList.size() > maxPhaseSites) {
logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase -- REDUCING to " + maxPhaseSites + " sites:\n" + toString(sampleWindowVaList));
// Assemble the "sub-reads" from the heterozygous positions for this sample:
LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>();
for (VariantAndReads phaseInfoVr : sampleWindowVaList) {
ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(samp);
allPositions.add(readBases);
}
HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions);
logger.debug("Number of reads at sites: " + allReads.size());
int numUsedReads = 0;
int prevSiteIndex = phasingSiteIndex - 1;
int numToUse = maxPhaseSites - 2; // since always keep prevVr and vr
int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue();
// Update the phasing table based on each of the sub-reads for this sample:
for (Map.Entry<String, Read> nameToReads : allReads.entrySet()) {
Read rd = nameToReads.getValue();
if (rd.numNonNulls() <= 1) // can't possibly provide any phasing information, so save time
continue;
int numOnLeft = prevSiteIndex;
int numOnRight = sampleWindowVaList.size() - (phasingSiteIndex + 1);
numUsedReads++;
if (DEBUG_DETAILED)
logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : ""));
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
pte.getScore().integrateReadScore(score);
if (DEBUG_DETAILED)
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
int useOnLeft, useOnRight;
if (numOnLeft <= numOnRight) {
useOnLeft = Math.min(halfToUse, numOnLeft);
useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
}
else { // numOnRight < numOnLeft
useOnRight = Math.min(halfToUse, numOnRight);
useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
}
int startIndex = prevSiteIndex - useOnLeft;
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
phasingSiteIndex -= startIndex;
sampleWindowVaList = sampleWindowVaList.subList(startIndex, stopIndex);
logger.warn("REDUCED to " + sampleWindowVaList.size() + " sites:\n" + toString(sampleWindowVaList));
}
PhaseResult pr = phaseSample(samp, sampleWindowVaList, phasingSiteIndex);
genotypesArePhased = (pr.phaseQuality >= phaseQualityThresh);
if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevGenotype);
logger.debug("CHOSEN PHASE FOR PREVIOUS:\n" + prevBiall + "\n");
logger.debug("CHOSE PHASE:\n" + biall + "\n\n");
ensurePhasing(biall, prevBiall, pr.haplotype);
gtAttribs.put("PQ", pr.phaseQuality);
}
if (statsWriter != null)
statsWriter.addStat(samp, VariantContextUtils.getLocation(vc), distance(prevVc, vc), pr.phaseQuality);
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
if (genotypesArePhased)
sampPhaseCounts.numPhased++;
}
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
logger.debug("numUsedReads = " + numUsedReads);
// Marginalize each haplotype to its first 2 positions:
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
// Determine the phase at this position:
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double posteriorProb = maxEntry.getScore().getValue();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
}
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
if (statsWriter != null)
statsWriter.addStat(samp, distance(prevVc, vc), phaseQuality);
genotypesArePhased = (phaseQuality >= phaseQualityThresh);
if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevVc.getGenotype(samp));
ensurePhasing(biall, prevBiall, maxEntry.getHaplotypeClass().getRepresentative());
gtAttribs.put("PQ", phaseQuality);
logger.debug("CHOSE PHASE:\n" + biall + "\n\n");
}
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
sampPhaseCounts = new PhaseCounts();
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
sampPhaseCounts.numPhased += (genotypesArePhased ? 1 : 0);
}
List<Allele> phasedAll = biall.getAllelesAsList();
Genotype phasedGt = new Genotype(gt.getSampleName(), phasedAll, gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
phasedGtMap.put(samp, phasedGt);
}
VariantContext phasedVc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes());
prevVr.variant = phasedVc;
prevVr.sampleReadBases = vr.sampleReadBases;
phaseStats.addIn(new PhasingStats(samplePhaseStats));
return phasedVc;
VariantContext phasedVc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), phasedGtMap, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes());
return new VariantAndReads(phasedVc, vr.sampleReadBases, vr.processVariant);
}
/**
 * Phases a single sample at the site variantList.get(phasingSiteIndex) against its
 * previous heterozygous site, using the reads spanning all sites in variantList.
 *
 * Builds a haplotype table over the window, scores each haplotype class against every
 * multi-site "sub-read" for this sample, marginalizes the table down to 2 positions
 * (the previous site and the current site), normalizes, and converts the resulting
 * error mass into an uncapped PHRED-scaled phase quality.
 *
 * @param sample           the sample name being phased
 * @param variantList      window of het sites for this sample (context + site to phase)
 * @param phasingSiteIndex index within variantList of the site being phased
 *                         (its predecessor at phasingSiteIndex - 1 provides the phasing anchor)
 * @return the most likely representative haplotype and its PHRED-scaled quality
 */
private PhaseResult phaseSample(String sample, List<VariantAndReads> variantList, int phasingSiteIndex) {
/* Will map a phase and its "complement" to a single representative phase,
and marginalizeTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
*/
HaplotypeTableCreator tabCreator = new BiallelicComplementHaplotypeTableCreator(variantList, sample, phasingSiteIndex - 1, 2);
PhasingTable sampleHaps = tabCreator.getNewTable();
// Assemble the "sub-reads" from the heterozygous positions for this sample:
LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>();
for (VariantAndReads phaseInfoVr : variantList) {
ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(sample);
allPositions.add(readBases);
}
HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions);
logger.debug("Number of reads at sites: " + allReads.size());
int numUsedReads = 0;
// Update the phasing table based on each of the sub-reads for this sample:
for (Map.Entry<String, Read> nameToReads : allReads.entrySet()) {
Read rd = nameToReads.getValue();
if (rd.numNonNulls() <= 1) // can't possibly provide any phasing information, so save time
continue;
numUsedReads++;
if (DEBUG_DETAILED)
logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : ""));
// Each informative read contributes a score to every haplotype class it matches:
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
pte.getScore().integrateReadScore(score);
if (DEBUG_DETAILED)
logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score);
}
}
logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n");
logger.debug("numUsedReads = " + numUsedReads);
// Marginalize each haplotype to its first 2 positions:
sampleHaps = HaplotypeTableCreator.marginalizeTable(sampleHaps);
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + sampleHaps + "\n");
// Determine the phase at this position:
sampleHaps.normalizeScores();
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + sampleHaps + "\n");
PhasingTable.PhasingTableEntry maxEntry = sampleHaps.maxEntry();
double posteriorProb = maxEntry.getScore().getValue();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
// phase quality is derived from the total probability mass of all NON-maximal entries:
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
}
double phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
logger.debug("MAX hap:\t" + maxEntry.getHaplotypeClass() + "\tposteriorProb:\t" + posteriorProb + "\tphaseQuality:\t" + phaseQuality);
return new PhaseResult(maxEntry.getHaplotypeClass().getRepresentative(), phaseQuality);
}
/*
@ -343,11 +438,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
curBiall.swapAlleles();
}
/**
 * Reports whether the previously emitted variant (prevVr.variant) exists and lies
 * on the same contig as vc, so that it can serve as phasing context for vc.
 */
private boolean previousIsRelevantTo(VariantContext vc) {
    VariantContext previous = prevVr.variant;
    if (previous == null)
        return false;
    GenomeLoc previousLoc = VariantContextUtils.getLocation(previous);
    GenomeLoc currentLoc = VariantContextUtils.getLocation(vc);
    return previousLoc.onSameContig(currentLoc);
}
private boolean isInWindowRange(VariantContext vc1, VariantContext vc2) {
GenomeLoc loc1 = VariantContextUtils.getLocation(vc1);
GenomeLoc loc2 = VariantContextUtils.getLocation(vc2);
@ -369,9 +459,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
private void writeVCF(VariantContext vc) {
if (writer == null)
initializeVcfWriter(vc);
byte refBase;
if (!vc.isIndel()) {
Allele varAllele = vc.getReference();
@ -393,12 +480,12 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
/**
* Phase anything left in the cached siteQueue, and report the number of reads and VariantContexts processed.
* Phase anything left in the cached unphasedSiteQueue, and report the number of reads and VariantContexts processed.
*
* @param result the number of reads and VariantContexts seen.
*/
public void onTraversalDone(PhasingStats result) {
List<VariantContext> finalList = processQueue(null, result);
List<VariantContext> finalList = processQueue(result, true);
writeVarContList(finalList);
if (statsWriter != null)
statsWriter.close();
@ -532,6 +619,12 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
public HashMap<String, ReadBasesAtPosition> sampleReadBases;
public boolean processVariant;
/**
 * Constructs directly from an already-built per-sample read-bases map
 * (e.g., when re-wrapping a phased variant with the original pileup data,
 * as done in finalizePhasing).
 *
 * @param variant         the variant at this site
 * @param sampleReadBases map from sample name to the read bases observed at this position
 * @param processVariant  whether this site participates in phasing calculations
 */
public VariantAndReads(VariantContext variant, HashMap<String, ReadBasesAtPosition> sampleReadBases, boolean processVariant) {
this.variant = variant;
this.sampleReadBases = sampleReadBases;
this.processVariant = processVariant;
}
public VariantAndReads(VariantContext variant, AlignmentContext alignment, boolean processVariant) {
this.variant = variant;
this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>();
@ -560,6 +653,20 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
}
/**
 * Renders the genomic loci of the given sites as a single string of the form
 * "loc1 -- loc2 -- ..." (empty string for an empty list).
 */
private static String toString(List<VariantAndReads> vrList) {
    StringBuilder out = new StringBuilder();
    String separator = "";
    for (VariantAndReads entry : vrList) {
        out.append(separator);
        out.append(VariantContextUtils.getLocation(entry.variant));
        separator = " -- ";
    }
    return out.toString();
}
private static class ReadBase {
public String readName;
public byte base;
@ -599,19 +706,14 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
//
private static abstract class HaplotypeTableCreator {
protected Genotype[] genotypes;
protected Biallele[] bialleles;
public HaplotypeTableCreator(List<VariantAndReads> vaList, String sample) {
this.genotypes = new Genotype[vaList.size()];
this.bialleles = new Biallele[vaList.size()];
int index = 0;
for (VariantAndReads phaseInfoVr : vaList) {
VariantContext phaseInfoVc = phaseInfoVr.variant;
Genotype phaseInfoGt = phaseInfoVc.getGenotype(sample);
genotypes[index] = phaseInfoGt;
bialleles[index] = new Biallele(phaseInfoGt);
index++;
genotypes[index++] = phaseInfoGt;
}
}
@ -643,7 +745,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
Haplotype rep = pte.getHaplotypeClass().getRepresentative();
PreciseNonNegativeDouble score = hapMap.get(rep);
if (score == null) {
score = new PreciseNonNegativeDouble(0.0);
score = new PreciseNonNegativeDouble(ZERO);
hapMap.put(rep, score);
}
score.plusEqual(pte.getScore());
@ -663,10 +765,18 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
private static class BiallelicComplementHaplotypeTableCreator extends HaplotypeTableCreator {
private Biallele[] bialleles;
private int startIndex;
private int marginalizeLength;
public BiallelicComplementHaplotypeTableCreator(List<VariantAndReads> vaList, String sample, int marginalizeLength) {
public BiallelicComplementHaplotypeTableCreator(List<VariantAndReads> vaList, String sample, int startIndex, int marginalizeLength) {
super(vaList, sample);
this.bialleles = new Biallele[genotypes.length];
for (int i = 0; i < genotypes.length; i++)
bialleles[i] = new Biallele(genotypes[i]);
this.startIndex = startIndex;
this.marginalizeLength = marginalizeLength;
}
@ -675,15 +785,18 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) {
if (bialleles[0].matchesTopBase(hap.getBase(0))) {
/* hap is the "representative" haplotype [arbitrarily defined to be
the one with the top base at the 0th position]
if (bialleles[startIndex].matchesTopBase(hap.getBase(startIndex))) {
/* hap is the "representative" haplotype [DEFINED here to be
the one with the top base at the startIndex position.
NOTE that it is CRITICAL that this definition be consistent with the representative sub-haplotypes defined below!]
*/
ArrayList<Haplotype> hapList = new ArrayList<Haplotype>();
hapList.add(hap);
hapList.add(complement(hap));
Haplotype rep = hap.subHaplotype(0, Math.min(marginalizeLength, hap.size())); // only want first marginalizeLength positions
// want marginalizeLength positions starting at startIndex:
Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength);
HaplotypeClass hapClass = new HaplotypeClass(hapList, rep);
table.addEntry(hapClass, hapClassPrior);
}
@ -787,6 +900,16 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
}
}
}
/**
 * Simple value holder pairing the haplotype chosen by phaseSample() with the
 * PHRED-scaled (uncapped) quality of that phasing call.
 */
private static class PhaseResult {
// The representative haplotype chosen as the most likely phase.
public Haplotype haplotype;
// PHRED-scaled confidence in the phase call.
public double phaseQuality;
public PhaseResult(Haplotype haplotype, double phaseQuality) {
this.haplotype = haplotype;
this.phaseQuality = phaseQuality;
}
}
}
@ -903,7 +1026,7 @@ class Haplotype extends BaseArray implements Cloneable {
// Returns a new Haplotype containing the portion of this Haplotype between the specified fromIndex, inclusive, and toIndex, exclusive.
public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, toIndex));
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
}
}
@ -1114,10 +1237,10 @@ class PhasingQualityStatsWriter {
this.variantStatsFilePrefix = variantStatsFilePrefix;
}
public void addStat(String sample, int distanceFromPrevious, double phasingQuality) {
public void addStat(String sample, GenomeLoc locus, int distanceFromPrevious, double phasingQuality) {
BufferedWriter sampWriter = sampleToStatsWriter.get(sample);
if (sampWriter == null) {
String fileName = variantStatsFilePrefix + "." + sample + ".distance_PQ.txt";
String fileName = variantStatsFilePrefix + "." + sample + ".locus_distance_PQ.txt";
FileOutputStream output;
try {
@ -1129,7 +1252,7 @@ class PhasingQualityStatsWriter {
sampleToStatsWriter.put(sample, sampWriter);
}
try {
sampWriter.write(distanceFromPrevious + "\t" + phasingQuality + "\n");
sampWriter.write(locus + "\t" + distanceFromPrevious + "\t" + phasingQuality + "\n");
sampWriter.flush();
} catch (IOException e) {
throw new RuntimeException("Unable to write to per-sample phasing quality stats file", e);