Change ReadBackedPhasing to be VCF4.1 compliant (but instead of using PS tags, it uses HP tags for pairs of diploid haplotype labels). The important point is that it no longer uses the '|' character instead of '/' or flips around the allele order in the genotype. Instead, the HP FORMAT targ is used to mark the haplotype labels with respect to other genotypes for the same sample. Note that this enables the phasing of non-consecutive (but nearby) sites, based on mate pairs, for example. Also, updated the HaplotypeCallerValidation Qscript to perform PhaseByTransmission (if ped file given) and then ReadBackedPhasing

This commit is contained in:
Menachem Fromer 2013-01-08 16:40:32 -05:00
parent b4e7b3d691
commit 6984ab0c78
2 changed files with 213 additions and 137 deletions

View File

@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
class Haplotype extends BaseArray implements Cloneable {
public Haplotype(byte[] bases) {
@ -68,4 +71,19 @@ class Haplotype extends BaseArray implements Cloneable {
public Haplotype subHaplotype(int fromIndex, int toIndex) {
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
}
public Haplotype subHaplotype(Set<Integer> inds) {
List<Byte> basesList = new LinkedList<Byte>();
for (int i : inds) {
if (0 <= i && i < bases.length)
basesList.add(bases[i]);
}
Byte[] newBases = new Byte[basesList.size()];
int index = 0;
for (Byte b : basesList)
newBases[index++] = b;
return new Haplotype(newBases);
}
}

View File

@ -137,6 +137,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
@Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false)
private boolean permitNoSampleOverlap = false;
@Deprecated
/**
* Important note: do not use this argument if your input data set is not already phased or it will cause the tool to skip over all heterozygous sites.
*/
@ -151,6 +152,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0);
public static final String PQ_KEY = "PQ";
public static final String HP_KEY = "HP";
// In order to detect phase inconsistencies:
private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads
@ -219,6 +221,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
// Phasing-specific INFO fields:
hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality"));
hInfo.add(new VCFFormatHeaderLine(HP_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Read-backed phasing haplotype identifiers"));
hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?"));
// todo -- fix samplesToPhase
@ -378,27 +381,30 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
if (DEBUG) logger.debug("sample = " + samp);
if (isUnfilteredCalledDiploidGenotype(gt)) {
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
// true <-> can trivially phase a hom site relative to ANY previous site:
Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
uvc.setGenotype(samp, phasedGt);
}
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
SNPallelePair allelePair = new SNPallelePair(gt);
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
if (gt.isHet()) { // Attempt to phase this het genotype relative to *SOME* previous het genotype:
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
/* Notes:
1. Call to next() advances iterator to next position in partiallyPhasedSites.
2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads.
*/
UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant;
Genotype prevHetGenotype = prevUvc.getGenotype(samp);
// Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites):
List<GenotypeAndReadBases> prevHetGenotypes = new LinkedList<GenotypeAndReadBases>();
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> phasedIt = partiallyPhasedSites.iterator();
while (phasedIt.hasNext()) {
UnfinishedVariantAndReads phasedVr = phasedIt.next();
Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp);
if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation());
prevHetGenotypes.add(grb);
if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
}
}
SNPallelePair allelePair = new SNPallelePair(gt);
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
boolean phasedCurGenotypeRelativeToPrevious = false;
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
PhaseResult pr = phaseSampleAtSite(phaseWindow);
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
if (pr.phasingContainsInconsistencies) {
if (DEBUG)
@ -406,44 +412,43 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
uvc.setPhasingInconsistent();
}
if (genotypesArePhased) {
if (phasedCurGenotypeRelativeToPrevious) {
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
if (!prevHetGenotype.hasAnyAttribute(HP_KEY))
throw new ReviewedStingException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(HP_KEY);
if (DEBUG)
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n");
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
Genotype phasedGt = new GenotypeBuilder(gt)
.alleles(allelePair.getAllelesAsList())
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
.attribute(HP_KEY, curPairNames)
.make();
uvc.setGenotype(samp, phasedGt);
}
// Now, update the 0 or more "interior" hom sites in between the previous het site and this het site:
while (prevHetAndInteriorIt.hasNext()) {
UnfinishedVariantContext interiorUvc = prevHetAndInteriorIt.next().unfinishedVariant;
Genotype handledGt = interiorUvc.getGenotype(samp);
if (handledGt == null || !isUnfilteredCalledDiploidGenotype(handledGt))
throw new ReviewedStingException("LOGICAL error: should not have breaks WITHIN haplotype");
if (!handledGt.isHom())
throw new ReviewedStingException("LOGICAL error: should not have anything besides hom sites IN BETWEEN two het sites");
if (DEBUG) {
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
// Use the same phasing consistency and PQ for each hom site in the "interior" as for the het-het phase:
if (pr.phasingContainsInconsistencies)
interiorUvc.setPhasingInconsistent();
if (genotypesArePhased) {
Genotype phasedHomGt = new GenotypeBuilder(handledGt)
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
interiorUvc.setGenotype(samp, phasedHomGt);
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
logger.debug("\n");
}
}
if (statsWriter != null)
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
if (statsWriter != null) {
GenomeLoc prevLoc = null;
int curIndex = 0;
for (GenotypeAndReadBases grb : prevHetGenotypes) {
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
prevLoc = grb.loc;
break;
}
++curIndex;
}
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
}
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
if (sampPhaseCounts == null) {
@ -453,14 +458,28 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
sampPhaseCounts.numTestedSites++;
if (pr.phasingContainsInconsistencies) {
if (genotypesArePhased)
if (phasedCurGenotypeRelativeToPrevious)
sampPhaseCounts.numInconsistentSitesPhased++;
else
sampPhaseCounts.numInconsistentSitesNotPhased++;
}
if (genotypesArePhased)
if (phasedCurGenotypeRelativeToPrevious)
sampPhaseCounts.numPhased++;
// Phased current relative to *SOME* previous het genotype, so break out of loop:
if (phasedCurGenotypeRelativeToPrevious)
break;
}
if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het:
String locStr = Integer.toString(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc).getStart());
Genotype startNewHaplotypeGt = new GenotypeBuilder(gt)
.attribute(HP_KEY, new String[]{locStr + "-1", locStr + "-2"})
.make();
uvc.setGenotype(samp, startNewHaplotypeGt);
}
}
}
@ -488,73 +507,40 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private class PhasingWindow {
private Genotype[] hetGenotypes = null;
private CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = null;
private int phaseRelativeToIndex = -1;
private int phasingSiteIndex = -1;
private Map<String, PhasingRead> readsAtHetSites = null;
private void clearFields() {
hetGenotypes = null;
prevHetAndInteriorIt = null;
phasingSiteIndex = -1;
readsAtHetSites = null;
}
public boolean hasPreviousHets() {
return phasingSiteIndex > 0;
public Genotype phaseRelativeToGenotype() {
return hetGenotypes[phaseRelativeToIndex];
}
// ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)]
public PhasingWindow(VariantAndReads vr, String sample) {
List<GenotypeAndReadBases> listHetGenotypes = new LinkedList<GenotypeAndReadBases>();
public PhasingWindow(VariantAndReads vr, String sample, List<GenotypeAndReadBases> prevHetGenotypes, int goBackFromEndOfPrevHets) {
if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase
throw new ReviewedStingException("Should never get empty set of previous sites to phase against");
// Include previously phased sites in the phasing computation:
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> phasedIt = partiallyPhasedSites.iterator();
while (phasedIt.hasNext()) {
UnfinishedVariantAndReads phasedVr = phasedIt.next();
Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample);
if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break"
listHetGenotypes.clear(); // clear out any history
}
else if (gt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation());
listHetGenotypes.add(grb);
if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
prevHetAndInteriorIt = phasedIt.clone();
}
}
// Include these previously phased sites in the phasing computation:
List<GenotypeAndReadBases> listHetGenotypes = new LinkedList<GenotypeAndReadBases>(prevHetGenotypes);
phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets;
phasingSiteIndex = listHetGenotypes.size();
if (phasingSiteIndex == 0) { // no previous sites against which to phase
clearFields();
return;
}
prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()]
if (respectPhaseInInput) {
Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample);
if (!prevHetGenotype.isPhased()) {
// Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]:
clearFields();
return;
}
}
// Add the (het) position to be phased:
// Add the (het) position to be phased [at phasingSiteIndex]:
GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant);
GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus);
listHetGenotypes.add(grbPhase);
if (DEBUG)
logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
// Include as-of-yet unphased sites in the phasing computation:
for (VariantAndReads nextVr : unphasedSiteQueue) {
if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc
break;
Genotype gt = nextVr.variant.getGenotype(sample);
if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break"
break;
}
else if (gt.isHet()) {
if (gt != null && isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) {
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant));
listHetGenotypes.add(grb);
if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
@ -571,7 +557,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
listHetGenotypes = removeExtraneousSites(listHetGenotypes);
// In any case, must still trim the window size to be "feasible"
// [**NOTE**: May want to do this to try maximize the preservation of paths from (phasingSiteIndex - 1) to phasingSiteIndex]:
// [**NOTE**: May want to do this to try maximize the preservation of paths from phaseRelativeToIndex to phasingSiteIndex]:
if (listHetGenotypes.size() > maxPhaseSites) {
listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus);
@ -585,8 +571,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
buildReadsAtHetSites(listHetGenotypes, onlyKeepReads);
// Copy to a fixed-size array:
if (DEBUG)
logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
if (DEBUG) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
hetGenotypes = new Genotype[listHetGenotypes.size()];
int index = 0;
for (GenotypeAndReadBases copyGrb : listHetGenotypes)
@ -713,7 +698,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
if (DEBUG) logger.debug("Read graph:\n" + readGraph);
Set<String> keepReads = new HashSet<String>();
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
/* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex):
In detail:
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
@ -759,7 +744,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R.
*/
int prev = phasingSiteIndex - 1;
int prev = phaseRelativeToIndex;
int cur = phasingSiteIndex;
if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev
@ -842,53 +827,114 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
// Remove all sites that have no read bases:
List<GenotypeAndReadBases> keepHetSites = new LinkedList<GenotypeAndReadBases>();
int index = 0;
int numPrecedingRemoved = 0;
int numPrecedingPhaseRelativeToSiteRemoved = 0;
int numPrecedingPhasingSiteRemoved = 0;
for (GenotypeAndReadBases grb : listHetGenotypes) {
boolean keepSite = sitesWithReads.contains(index);
if (DEBUG && logger.isDebugEnabled() && !keepSite)
logger.debug("Removing read-less site " + grb.loc);
if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) {
if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) {
keepHetSites.add(grb);
if (!keepSite)
if (DEBUG)
logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
}
else if (index <= phasingSiteIndex)
numPrecedingRemoved++;
else {
if (index <= phaseRelativeToIndex)
numPrecedingPhaseRelativeToSiteRemoved++;
if (index <= phasingSiteIndex)
numPrecedingPhasingSiteRemoved++;
}
index++;
}
phasingSiteIndex -= numPrecedingRemoved;
phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved;
phasingSiteIndex -= numPrecedingPhasingSiteRemoved;
return keepHetSites;
}
private class SortSitesBySumOfDist implements Comparator<Integer> {
private Vector<GenotypeAndReadBases> grb;
public SortSitesBySumOfDist(List<GenotypeAndReadBases> listHetGenotypes) {
grb = new Vector<GenotypeAndReadBases>(listHetGenotypes);
}
public int compare(Integer i1, Integer i2) {
int d1 = calcGenomicDist(i1);
int d2 = calcGenomicDist(i2);
if (d1 != d2)
return d1 - d2;
int id1 = calcIndexDist(i1);
int id2 = calcIndexDist(i2);
if (id1 != id2)
return id1 - id2;
return i1 - i2;
}
private int calcGenomicDist(int i) {
int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc);
int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc);
return d1 + d2;
}
private int calcIndexDist(int i) {
int d1 = Math.abs(i - phaseRelativeToIndex);
int d2 = Math.abs(i - phasingSiteIndex);
return d1 + d2;
}
}
private List<GenotypeAndReadBases> trimWindow(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phaseLocus) {
if (DEBUG)
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes
int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites!
int numOnLeft = prevSiteIndex;
int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1);
int useOnLeft, useOnRight;
if (numOnLeft <= numOnRight) {
int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side
useOnLeft = Math.min(halfToUse, numOnLeft);
useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
Set<Integer> scoreAllIndices = new TreeSet<Integer>(new SortSitesBySumOfDist(listHetGenotypes));
for (int i = 0; i < listHetGenotypes.size(); ++i) {
if (i != phaseRelativeToIndex && i != phasingSiteIndex)
scoreAllIndices.add(i);
}
else { // numOnRight < numOnLeft
int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil]
useOnRight = Math.min(halfToUse, numOnRight);
useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
Set<Integer> keepIndices = new TreeSet<Integer>();
// always keep these two indices:
keepIndices.add(phaseRelativeToIndex);
keepIndices.add(phasingSiteIndex);
for (int addInd : scoreAllIndices) {
if (keepIndices.size() >= maxPhaseSites)
break;
else // keepIndices.size() < maxPhaseSites
keepIndices.add(addInd);
}
int startIndex = prevSiteIndex - useOnLeft;
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
phasingSiteIndex -= startIndex;
listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex);
List<GenotypeAndReadBases> newListHetGenotypes = new LinkedList<GenotypeAndReadBases>();
int newPhaseRelativeToIndex = -1;
int newPhasingSiteIndex = -1;
int oldIndex = 0;
int newIndex = 0;
for (GenotypeAndReadBases grb : listHetGenotypes) {
if (keepIndices.contains(oldIndex)) {
newListHetGenotypes.add(grb);
if (oldIndex == phaseRelativeToIndex)
newPhaseRelativeToIndex = newIndex;
if (oldIndex == phasingSiteIndex)
newPhasingSiteIndex = newIndex;
++newIndex;
}
++oldIndex;
}
phaseRelativeToIndex = newPhaseRelativeToIndex;
phasingSiteIndex = newPhasingSiteIndex;
listHetGenotypes = newListHetGenotypes;
if (DEBUG)
logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
@ -900,7 +946,8 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
/* Will map a phase and its "complement" to a single representative phase,
and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
*/
HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2);
int[] marginalizeInds = {phaseWindow.phaseRelativeToIndex, phaseWindow.phasingSiteIndex};
HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, marginalizeInds);
PhasingTable sampleHaps = tabCreator.getNewTable();
if (DEBUG && logger.isDebugEnabled()) {
@ -1015,17 +1062,27 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap.
*/
public static void ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, Haplotype hap) {
public static String[] ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, String[] prevPairNames, Haplotype hap) {
if (hap.size() < 2)
throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!");
String[] curPairNames = prevPairNames;
byte prevBase = hap.getBase(0); // The 1st base in the haplotype
byte curBase = hap.getBase(1); // The 2nd base in the haplotype
boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase);
boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase);
if (chosePrevTopChrom != choseCurTopChrom)
curAllelePair.swapAlleles();
if (chosePrevTopChrom != choseCurTopChrom) {
//curAllelePair.swapAlleles();
/* Instead of swapping the alleles (as we used to above),
we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype:
*/
curPairNames = new String[]{prevPairNames[1], prevPairNames[0]};
}
return curPairNames;
}
private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) {
@ -1036,8 +1093,8 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2)
}
private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) {
return uvc1.getLocation().distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
private int startDistance(GenomeLoc gl1, VariantContext vc2) {
return gl1.distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
}
public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) {
@ -1287,21 +1344,23 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator {
private SNPallelePair[] SNPallelePairs;
private int startIndex;
private int marginalizeLength;
Set<Integer> marginalizeInds;
public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) {
super(hetGenotypes);
this.SNPallelePairs = new SNPallelePair[genotypes.length];
for (int i = 0; i < genotypes.length; i++)
SNPallelePairs[i] = new SNPallelePair(genotypes[i]);
this.startIndex = startIndex;
this.marginalizeLength = marginalizeLength;
this.marginalizeInds = new TreeSet<Integer>();
for (int mind : marginalizeInds)
this.marginalizeInds.add(mind);
}
public PhasingTable getNewTable() {
int startIndex = marginalizeInds.iterator().next();
PhasingTable table = new PhasingTable();
for (Haplotype hap : getAllHaplotypes()) {
if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) {
@ -1313,8 +1372,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
hapList.add(hap);
hapList.add(complement(hap));
// want marginalizeLength positions starting at startIndex:
Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength);
Haplotype rep = hap.subHaplotype(marginalizeInds);
double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype
HaplotypeClass hapClass = new HaplotypeClass(hapList, rep);