Changed ReadBackedPhasing to be a RodWalker (more efficient, since it is ROD-focused)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4117 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-08-25 19:43:57 +00:00
parent 6eb1559c1d
commit 41e53d37e1
1 changed files with 85 additions and 65 deletions

View File

@ -52,11 +52,12 @@ import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
* Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads). * Walks along all loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using downstream reads).
* Use '-BTI variant' to only stop at positions in the VCF file bound to 'variant'. * Use '-BTI variant' to only stop at positions in the VCF file bound to 'variant'.
*/ */
@Requires(value = {}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) @Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class))
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality @ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput, PhasingStats> { public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
@Output(doc="File to which variants should be written",required=true) @Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null; protected VCFWriter writer = null;
@ -124,7 +125,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
* @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range.
*/ */
public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public PhasingStatsAndOutput map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (tracker == null) if (tracker == null || ref == null)
return null; return null;
PhasingStats phaseStats = new PhasingStats(); PhasingStats phaseStats = new PhasingStats();
@ -138,6 +139,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
VariantAndReads vr = new VariantAndReads(vc, context, processVariant); VariantAndReads vr = new VariantAndReads(vc, context, processVariant);
unphasedSiteQueue.add(vr); unphasedSiteQueue.add(vr);
logger.debug("Added variant to queue = " + VariantContextUtils.getLocation(vr.variant));
int numReads = 0; int numReads = 0;
if (context.hasBasePileup()) { if (context.hasBasePileup()) {
@ -155,11 +157,13 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) { private List<VariantContext> processQueue(PhasingStats phaseStats, boolean processAll) {
List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
if (!unphasedSiteQueue.isEmpty()) {
GenomeLoc lastLocus = null; GenomeLoc lastLocus = null;
if (!processAll && !unphasedSiteQueue.isEmpty()) if (!processAll)
lastLocus = VariantContextUtils.getLocation(unphasedSiteQueue.peekLast().variant); lastLocus = VariantContextUtils.getLocation(unphasedSiteQueue.peekLast().variant);
List<VariantContext> oldPhasedList = new LinkedList<VariantContext>();
while (!unphasedSiteQueue.isEmpty()) { while (!unphasedSiteQueue.isEmpty()) {
if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue if (!processAll) { // otherwise, phase until the end of unphasedSiteQueue
VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant; VariantContext nextToPhaseVc = unphasedSiteQueue.peek().variant;
@ -170,16 +174,19 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
// Already saw all variant positions within cacheWindow distance ahead of vc (on its contig) // Already saw all variant positions within cacheWindow distance ahead of vc (on its contig)
} }
// Update phasedSites before it's used in finalizePhasing: // Update phasedSites before it's used in finalizePhasing:
oldPhasedList.addAll(discardIrrelevantPhasedSites()); oldPhasedList.addAll(discardIrrelevantPhasedSites());
logger.debug("oldPhasedList(1) = " + toStringVCL(oldPhasedList));
VariantAndReads phasedVr = finalizePhasing(unphasedSiteQueue.remove(), phaseStats); VariantAndReads phasedVr = finalizePhasing(unphasedSiteQueue.remove(), phaseStats);
logger.debug("Finalized phasing for " + VariantContextUtils.getLocation(phasedVr.variant));
phasedSites.add(phasedVr); phasedSites.add(phasedVr);
} }
}
// Update phasedSites after finalizePhasing is done: // Update phasedSites after finalizePhasing is done:
oldPhasedList.addAll(discardIrrelevantPhasedSites()); oldPhasedList.addAll(discardIrrelevantPhasedSites());
logger.debug("oldPhasedList(2) = " + toStringVCL(oldPhasedList));
return oldPhasedList; return oldPhasedList;
} }
@ -193,10 +200,8 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
while (!phasedSites.isEmpty()) { while (!phasedSites.isEmpty()) {
VariantAndReads phasedVr = phasedSites.peek(); VariantAndReads phasedVr = phasedSites.peek();
VariantContext phasedVc = phasedVr.variant; VariantContext phasedVc = phasedVr.variant;
if (nextToPhaseVc != null && phasedVr.processVariant && if (nextToPhaseVc != null && phasedVr.processVariant && isInWindowRange(phasedVc, nextToPhaseVc)) {
(isInWindowRange(phasedVc, nextToPhaseVc) // nextToPhaseVc is still not far enough ahead of phasedVc to exclude phasedVc from calculations // nextToPhaseVc is still not far enough ahead of phasedVc to exclude phasedVc from calculations
// since ALWAYS want the previous site to be included for phasing nextToPhaseVc:
|| (phasedSites.size() == 1 && VariantContextUtils.getLocation(phasedVc).onSameContig(VariantContextUtils.getLocation(nextToPhaseVc))))) {
break; break;
} }
vcList.add(phasedSites.remove().variant); vcList.add(phasedSites.remove().variant);
@ -205,8 +210,8 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
return vcList; return vcList;
} }
/* Finalize phasing of vc (head of unphasedSiteQueue) using all VariantContext objects in /* Phase vc (removed head of unphasedSiteQueue) using all VariantContext objects in
phasedSites and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig). phasedSites, and all in unphasedSiteQueue that are within cacheWindow distance ahead of vc (on its contig).
ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue). ASSUMES: All VariantContexts in unphasedSiteQueue are in positions downstream of vc (head of queue).
*/ */
@ -214,12 +219,9 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
if (!vr.processVariant) if (!vr.processVariant)
return vr; // return vr as is return vr; // return vr as is
VariantContext vc = vr.variant;
logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc));
// Find the previous VariantContext (that was processed and phased): // Find the previous VariantContext (that was processed and phased):
VariantAndReads prevVr = null; VariantAndReads prevVr = null;
Iterator<VariantAndReads> backwardsIt = phasedSites.descendingIterator(); Iterator<VariantAndReads> backwardsIt = phasedSites.descendingIterator(); // look at most recently phased sites
while (backwardsIt.hasNext()) { while (backwardsIt.hasNext()) {
VariantAndReads backVr = backwardsIt.next(); VariantAndReads backVr = backwardsIt.next();
if (backVr.processVariant) { if (backVr.processVariant) {
@ -227,11 +229,13 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
break; break;
} }
} }
boolean hasPreviousSite = (prevVr != null); if (prevVr == null)
return vr; // return vr as is, since cannot phase against "nothing" (vc is at the beginning of the chromosome, or the previous was so far back it was removed from phasedSites)
LinkedList<VariantAndReads> windowVaList = null; VariantContext vc = vr.variant;
if (hasPreviousSite) { logger.debug("Will phase vc = " + VariantContextUtils.getLocation(vc));
windowVaList = new LinkedList<VariantAndReads>();
LinkedList<VariantAndReads> windowVaList = new LinkedList<VariantAndReads>();
// Include previously phased sites in the phasing computation: // Include previously phased sites in the phasing computation:
for (VariantAndReads phasedVr : phasedSites) { for (VariantAndReads phasedVr : phasedSites) {
@ -254,7 +258,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
for (VariantAndReads phaseInfoVr : windowVaList) for (VariantAndReads phaseInfoVr : windowVaList)
logger.debug("Using phaseInfoVc = " + VariantContextUtils.getLocation(phaseInfoVr.variant)); logger.debug("Using phaseInfoVc = " + VariantContextUtils.getLocation(phaseInfoVr.variant));
} }
}
logger.debug(""); logger.debug("");
Map<String, Genotype> sampGenotypes = vc.getGenotypes(); Map<String, Genotype> sampGenotypes = vc.getGenotypes();
@ -271,12 +274,11 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
Biallele biall = new Biallele(gt); Biallele biall = new Biallele(gt);
HashMap<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes()); HashMap<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
if (hasPreviousSite && gt.isHet()) { if (gt.isHet()) {
VariantContext prevVc = prevVr.variant; VariantContext prevVc = prevVr.variant;
Genotype prevGenotype = prevVc.getGenotype(samp); Genotype prevGenotype = prevVc.getGenotype(samp);
if (prevGenotype.isHet()) { //otherwise, can trivially phase if (prevGenotype.isHet()) { //otherwise, can trivially phase
logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: "); logger.debug("NON-TRIVIALLY CARE about TOP vs. BOTTOM for: " + "\n" + biall);
logger.debug("\n" + biall);
List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>(); List<VariantAndReads> sampleWindowVaList = new LinkedList<VariantAndReads>();
int phasingSiteIndex = -1; int phasingSiteIndex = -1;
@ -288,7 +290,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
sampleWindowVaList.add(phaseInfoVr); sampleWindowVaList.add(phaseInfoVr);
if (phasingSiteIndex == -1) { if (phasingSiteIndex == -1) {
if (phaseInfoVr == vr) if (phaseInfoVr == vr)
phasingSiteIndex = currentIndex; phasingSiteIndex = currentIndex; // index of vr in sampleWindowVaList
else else
currentIndex++; currentIndex++;
} }
@ -299,21 +301,22 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
throw new StingException("Internal error: could NOT find vr and/or prevVr!"); throw new StingException("Internal error: could NOT find vr and/or prevVr!");
if (sampleWindowVaList.size() > maxPhaseSites) { if (sampleWindowVaList.size() > maxPhaseSites) {
logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase -- REDUCING to " + maxPhaseSites + " sites:\n" + toString(sampleWindowVaList)); logger.warn("Trying to phase sample " + samp + " at locus " + VariantContextUtils.getLocation(vc) + " within a window of " + cacheWindow + " bases yields " + sampleWindowVaList.size() + " heterozygous sites to phase:\n" + toStringVRL(sampleWindowVaList));
int prevSiteIndex = phasingSiteIndex - 1; int prevSiteIndex = phasingSiteIndex - 1; // index of prevVr in sampleWindowVaList
int numToUse = maxPhaseSites - 2; // since always keep prevVr and vr int numToUse = maxPhaseSites - 2; // since always keep prevVr and vr
int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue();
int numOnLeft = prevSiteIndex; int numOnLeft = prevSiteIndex;
int numOnRight = sampleWindowVaList.size() - (phasingSiteIndex + 1); int numOnRight = sampleWindowVaList.size() - (phasingSiteIndex + 1);
int useOnLeft, useOnRight; int useOnLeft, useOnRight;
if (numOnLeft <= numOnRight) { if (numOnLeft <= numOnRight) {
int halfToUse = new Double(Math.floor(numToUse / 2.0)).intValue(); // skimp on the left [floor], and be generous with the right side
useOnLeft = Math.min(halfToUse, numOnLeft); useOnLeft = Math.min(halfToUse, numOnLeft);
useOnRight = Math.min(numToUse - useOnLeft, numOnRight); useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
} }
else { // numOnRight < numOnLeft else { // numOnRight < numOnLeft
int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil]
useOnRight = Math.min(halfToUse, numOnRight); useOnRight = Math.min(halfToUse, numOnRight);
useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
} }
@ -321,7 +324,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
phasingSiteIndex -= startIndex; phasingSiteIndex -= startIndex;
sampleWindowVaList = sampleWindowVaList.subList(startIndex, stopIndex); sampleWindowVaList = sampleWindowVaList.subList(startIndex, stopIndex);
logger.warn("REDUCED to " + sampleWindowVaList.size() + " sites:\n" + toString(sampleWindowVaList)); logger.warn("REDUCED to " + sampleWindowVaList.size() + " sites:\n" + toStringVRL(sampleWindowVaList));
} }
PhaseResult pr = phaseSample(samp, sampleWindowVaList, phasingSiteIndex); PhaseResult pr = phaseSample(samp, sampleWindowVaList, phasingSiteIndex);
@ -329,8 +332,8 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
if (genotypesArePhased) { if (genotypesArePhased) {
Biallele prevBiall = new Biallele(prevGenotype); Biallele prevBiall = new Biallele(prevGenotype);
logger.debug("CHOSEN PHASE FOR PREVIOUS:\n" + prevBiall + "\n"); logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevBiall + "\n");
logger.debug("CHOSE PHASE:\n" + biall + "\n\n"); logger.debug("THE PHASE CHOSEN HERE:\n" + biall + "\n\n");
ensurePhasing(biall, prevBiall, pr.haplotype); ensurePhasing(biall, prevBiall, pr.haplotype);
gtAttribs.put("PQ", pr.phaseQuality); gtAttribs.put("PQ", pr.phaseQuality);
@ -370,6 +373,8 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>(); LinkedList<ReadBasesAtPosition> allPositions = new LinkedList<ReadBasesAtPosition>();
for (VariantAndReads phaseInfoVr : variantList) { for (VariantAndReads phaseInfoVr : variantList) {
ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(sample); ReadBasesAtPosition readBases = phaseInfoVr.sampleReadBases.get(sample);
if (readBases == null)
readBases = new ReadBasesAtPosition(); // for transparency, put an empty list of bases at this position for sample
allPositions.add(readBases); allPositions.add(readBases);
} }
HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions); HashMap<String, Read> allReads = convertReadBasesAtPositionToReads(allPositions);
@ -540,7 +545,6 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
/* /*
Inner classes: Inner classes:
*/ */
private static class Biallele { private static class Biallele {
public Allele top; public Allele top;
public Allele bottom; public Allele bottom;
@ -653,7 +657,7 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
} }
} }
private static String toString(List<VariantAndReads> vrList) { private static String toStringVRL(List<VariantAndReads> vrList) {
boolean first = true; boolean first = true;
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (VariantAndReads vr : vrList) { for (VariantAndReads vr : vrList) {
@ -667,6 +671,20 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
return sb.toString(); return sb.toString();
} }
/*
   Builds a single-line description of vcList for debug logging: the location of each
   VariantContext (as given by VariantContextUtils.getLocation), in list order, with
   consecutive locations separated by " -- ". An empty list yields an empty string.
 */
private static String toStringVCL(List<VariantContext> vcList) {
    StringBuilder joined = new StringBuilder();
    String separator = ""; // nothing precedes the first location
    for (VariantContext site : vcList) {
        joined.append(separator);
        joined.append(VariantContextUtils.getLocation(site));
        separator = " -- "; // every later location is preceded by the delimiter
    }
    return joined.toString();
}
private static class ReadBase { private static class ReadBase {
public String readName; public String readName;
public byte base; public byte base;
@ -914,6 +932,8 @@ public class ReadBackedPhasingWalker extends LocusWalker<PhasingStatsAndOutput,
class PhasingScore extends PreciseNonNegativeDouble { class PhasingScore extends PreciseNonNegativeDouble {
public PhasingScore(double score) { public PhasingScore(double score) {
super(score); super(score);