Change ReadBackedPhasing to be VCF4.1 compliant (but instead of using PS tags, it uses HP tags for pairs of diploid haplotype labels). The important point is that it no longer uses the '|' character instead of '/' or flips around the allele order in the genotype. Instead, the HP FORMAT targ is used to mark the haplotype labels with respect to other genotypes for the same sample. Note that this enables the phasing of non-consecutive (but nearby) sites, based on mate pairs, for example. Also, updated the HaplotypeCallerValidation Qscript to perform PhaseByTransmission (if ped file given) and then ReadBackedPhasing
This commit is contained in:
parent
b4e7b3d691
commit
6984ab0c78
|
|
@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
class Haplotype extends BaseArray implements Cloneable {
|
||||
public Haplotype(byte[] bases) {
|
||||
|
|
@ -68,4 +71,19 @@ class Haplotype extends BaseArray implements Cloneable {
|
|||
public Haplotype subHaplotype(int fromIndex, int toIndex) {
|
||||
return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size())));
|
||||
}
|
||||
|
||||
public Haplotype subHaplotype(Set<Integer> inds) {
|
||||
List<Byte> basesList = new LinkedList<Byte>();
|
||||
for (int i : inds) {
|
||||
if (0 <= i && i < bases.length)
|
||||
basesList.add(bases[i]);
|
||||
}
|
||||
|
||||
Byte[] newBases = new Byte[basesList.size()];
|
||||
int index = 0;
|
||||
for (Byte b : basesList)
|
||||
newBases[index++] = b;
|
||||
|
||||
return new Haplotype(newBases);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -137,6 +137,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
@Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false)
|
||||
private boolean permitNoSampleOverlap = false;
|
||||
|
||||
@Deprecated
|
||||
/**
|
||||
* Important note: do not use this argument if your input data set is not already phased or it will cause the tool to skip over all heterozygous sites.
|
||||
*/
|
||||
|
|
@ -151,6 +152,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
private static PreciseNonNegativeDouble ZERO = new PreciseNonNegativeDouble(0.0);
|
||||
|
||||
public static final String PQ_KEY = "PQ";
|
||||
public static final String HP_KEY = "HP";
|
||||
|
||||
// In order to detect phase inconsistencies:
|
||||
private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads
|
||||
|
|
@ -219,6 +221,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
// Phasing-specific INFO fields:
|
||||
hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality"));
|
||||
hInfo.add(new VCFFormatHeaderLine(HP_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Read-backed phasing haplotype identifiers"));
|
||||
hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?"));
|
||||
|
||||
// todo -- fix samplesToPhase
|
||||
|
|
@ -378,27 +381,30 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
if (DEBUG) logger.debug("sample = " + samp);
|
||||
if (isUnfilteredCalledDiploidGenotype(gt)) {
|
||||
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
|
||||
// true <-> can trivially phase a hom site relative to ANY previous site:
|
||||
Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
}
|
||||
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
|
||||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp);
|
||||
if (phaseWindow.hasPreviousHets()) { // Otherwise, nothing to phase this against
|
||||
SNPallelePair allelePair = new SNPallelePair(gt);
|
||||
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
|
||||
if (gt.isHet()) { // Attempt to phase this het genotype relative to *SOME* previous het genotype:
|
||||
|
||||
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt;
|
||||
/* Notes:
|
||||
1. Call to next() advances iterator to next position in partiallyPhasedSites.
|
||||
2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads.
|
||||
*/
|
||||
UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant;
|
||||
Genotype prevHetGenotype = prevUvc.getGenotype(samp);
|
||||
// Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites):
|
||||
List<GenotypeAndReadBases> prevHetGenotypes = new LinkedList<GenotypeAndReadBases>();
|
||||
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> phasedIt = partiallyPhasedSites.iterator();
|
||||
while (phasedIt.hasNext()) {
|
||||
UnfinishedVariantAndReads phasedVr = phasedIt.next();
|
||||
Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp);
|
||||
if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) {
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation());
|
||||
prevHetGenotypes.add(grb);
|
||||
if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
|
||||
}
|
||||
}
|
||||
|
||||
SNPallelePair allelePair = new SNPallelePair(gt);
|
||||
if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair);
|
||||
|
||||
boolean phasedCurGenotypeRelativeToPrevious = false;
|
||||
for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) {
|
||||
PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets);
|
||||
|
||||
PhaseResult pr = phaseSampleAtSite(phaseWindow);
|
||||
boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality);
|
||||
phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality);
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (DEBUG)
|
||||
|
|
@ -406,44 +412,43 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
uvc.setPhasingInconsistent();
|
||||
}
|
||||
|
||||
if (genotypesArePhased) {
|
||||
if (phasedCurGenotypeRelativeToPrevious) {
|
||||
Genotype prevHetGenotype = phaseWindow.phaseRelativeToGenotype();
|
||||
SNPallelePair prevAllelePair = new SNPallelePair(prevHetGenotype);
|
||||
if (!prevHetGenotype.hasAnyAttribute(HP_KEY))
|
||||
throw new ReviewedStingException("Internal error: missing haplotype markings for previous genotype, even though we put it there...");
|
||||
String[] prevPairNames = (String[]) prevHetGenotype.getAnyAttribute(HP_KEY);
|
||||
|
||||
if (DEBUG)
|
||||
logger.debug("THE PHASE PREVIOUSLY CHOSEN FOR PREVIOUS:\n" + prevAllelePair + "\n");
|
||||
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
|
||||
|
||||
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
|
||||
String[] curPairNames = ensurePhasing(allelePair, prevAllelePair, prevPairNames, pr.haplotype);
|
||||
Genotype phasedGt = new GenotypeBuilder(gt)
|
||||
.alleles(allelePair.getAllelesAsList())
|
||||
.attribute(PQ_KEY, pr.phaseQuality)
|
||||
.phased(genotypesArePhased).make();
|
||||
.attribute(HP_KEY, curPairNames)
|
||||
.make();
|
||||
uvc.setGenotype(samp, phasedGt);
|
||||
}
|
||||
|
||||
// Now, update the 0 or more "interior" hom sites in between the previous het site and this het site:
|
||||
while (prevHetAndInteriorIt.hasNext()) {
|
||||
UnfinishedVariantContext interiorUvc = prevHetAndInteriorIt.next().unfinishedVariant;
|
||||
Genotype handledGt = interiorUvc.getGenotype(samp);
|
||||
if (handledGt == null || !isUnfilteredCalledDiploidGenotype(handledGt))
|
||||
throw new ReviewedStingException("LOGICAL error: should not have breaks WITHIN haplotype");
|
||||
if (!handledGt.isHom())
|
||||
throw new ReviewedStingException("LOGICAL error: should not have anything besides hom sites IN BETWEEN two het sites");
|
||||
if (DEBUG) {
|
||||
logger.debug("PREVIOUS CHROMOSOME NAMES: Top= " + prevPairNames[0] + ", Bot= " + prevPairNames[1]);
|
||||
logger.debug("PREVIOUS CHROMOSOMES:\n" + prevAllelePair + "\n");
|
||||
|
||||
// Use the same phasing consistency and PQ for each hom site in the "interior" as for the het-het phase:
|
||||
if (pr.phasingContainsInconsistencies)
|
||||
interiorUvc.setPhasingInconsistent();
|
||||
|
||||
if (genotypesArePhased) {
|
||||
Genotype phasedHomGt = new GenotypeBuilder(handledGt)
|
||||
.attribute(PQ_KEY, pr.phaseQuality)
|
||||
.phased(genotypesArePhased).make();
|
||||
interiorUvc.setGenotype(samp, phasedHomGt);
|
||||
logger.debug("CURRENT CHROMOSOME NAMES: Top= " + curPairNames[0] + ", Bot= " + curPairNames[1]);
|
||||
logger.debug("CURRENT CHROMOSOMES:\n" + allelePair + "\n");
|
||||
logger.debug("\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (statsWriter != null)
|
||||
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevUvc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
if (statsWriter != null) {
|
||||
GenomeLoc prevLoc = null;
|
||||
int curIndex = 0;
|
||||
for (GenotypeAndReadBases grb : prevHetGenotypes) {
|
||||
if (curIndex == prevHetGenotypes.size() - 1 - goBackFromEndOfPrevHets) {
|
||||
prevLoc = grb.loc;
|
||||
break;
|
||||
}
|
||||
++curIndex;
|
||||
}
|
||||
statsWriter.addStat(samp, GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc), startDistance(prevLoc, vc), pr.phaseQuality, phaseWindow.readsAtHetSites.size(), phaseWindow.hetGenotypes.length);
|
||||
}
|
||||
|
||||
PhaseCounts sampPhaseCounts = samplePhaseStats.get(samp);
|
||||
if (sampPhaseCounts == null) {
|
||||
|
|
@ -453,14 +458,28 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
sampPhaseCounts.numTestedSites++;
|
||||
|
||||
if (pr.phasingContainsInconsistencies) {
|
||||
if (genotypesArePhased)
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
sampPhaseCounts.numInconsistentSitesPhased++;
|
||||
else
|
||||
sampPhaseCounts.numInconsistentSitesNotPhased++;
|
||||
}
|
||||
|
||||
if (genotypesArePhased)
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
sampPhaseCounts.numPhased++;
|
||||
|
||||
// Phased current relative to *SOME* previous het genotype, so break out of loop:
|
||||
if (phasedCurGenotypeRelativeToPrevious)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!phasedCurGenotypeRelativeToPrevious) { // Either no previous hets, or unable to phase relative to any previous het:
|
||||
String locStr = Integer.toString(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc).getStart());
|
||||
|
||||
Genotype startNewHaplotypeGt = new GenotypeBuilder(gt)
|
||||
.attribute(HP_KEY, new String[]{locStr + "-1", locStr + "-2"})
|
||||
.make();
|
||||
|
||||
uvc.setGenotype(samp, startNewHaplotypeGt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -488,73 +507,40 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
private class PhasingWindow {
|
||||
private Genotype[] hetGenotypes = null;
|
||||
private CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> prevHetAndInteriorIt = null;
|
||||
|
||||
private int phaseRelativeToIndex = -1;
|
||||
private int phasingSiteIndex = -1;
|
||||
|
||||
private Map<String, PhasingRead> readsAtHetSites = null;
|
||||
|
||||
private void clearFields() {
|
||||
hetGenotypes = null;
|
||||
prevHetAndInteriorIt = null;
|
||||
phasingSiteIndex = -1;
|
||||
readsAtHetSites = null;
|
||||
}
|
||||
|
||||
public boolean hasPreviousHets() {
|
||||
return phasingSiteIndex > 0;
|
||||
public Genotype phaseRelativeToGenotype() {
|
||||
return hetGenotypes[phaseRelativeToIndex];
|
||||
}
|
||||
|
||||
// ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)]
|
||||
|
||||
public PhasingWindow(VariantAndReads vr, String sample) {
|
||||
List<GenotypeAndReadBases> listHetGenotypes = new LinkedList<GenotypeAndReadBases>();
|
||||
public PhasingWindow(VariantAndReads vr, String sample, List<GenotypeAndReadBases> prevHetGenotypes, int goBackFromEndOfPrevHets) {
|
||||
if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase
|
||||
throw new ReviewedStingException("Should never get empty set of previous sites to phase against");
|
||||
|
||||
// Include previously phased sites in the phasing computation:
|
||||
CloneableIteratorLinkedList.CloneableIterator<UnfinishedVariantAndReads> phasedIt = partiallyPhasedSites.iterator();
|
||||
while (phasedIt.hasNext()) {
|
||||
UnfinishedVariantAndReads phasedVr = phasedIt.next();
|
||||
Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample);
|
||||
if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break"
|
||||
listHetGenotypes.clear(); // clear out any history
|
||||
}
|
||||
else if (gt.isHet()) {
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation());
|
||||
listHetGenotypes.add(grb);
|
||||
if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc);
|
||||
prevHetAndInteriorIt = phasedIt.clone();
|
||||
}
|
||||
}
|
||||
// Include these previously phased sites in the phasing computation:
|
||||
List<GenotypeAndReadBases> listHetGenotypes = new LinkedList<GenotypeAndReadBases>(prevHetGenotypes);
|
||||
|
||||
phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets;
|
||||
phasingSiteIndex = listHetGenotypes.size();
|
||||
if (phasingSiteIndex == 0) { // no previous sites against which to phase
|
||||
clearFields();
|
||||
return;
|
||||
}
|
||||
prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()]
|
||||
|
||||
if (respectPhaseInInput) {
|
||||
Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample);
|
||||
if (!prevHetGenotype.isPhased()) {
|
||||
// Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]:
|
||||
clearFields();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Add the (het) position to be phased:
|
||||
// Add the (het) position to be phased [at phasingSiteIndex]:
|
||||
GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant);
|
||||
GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus);
|
||||
listHetGenotypes.add(grbPhase);
|
||||
if (DEBUG)
|
||||
logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
|
||||
if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]");
|
||||
|
||||
// Include as-of-yet unphased sites in the phasing computation:
|
||||
for (VariantAndReads nextVr : unphasedSiteQueue) {
|
||||
if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc
|
||||
break;
|
||||
Genotype gt = nextVr.variant.getGenotype(sample);
|
||||
if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break"
|
||||
break;
|
||||
}
|
||||
else if (gt.isHet()) {
|
||||
if (gt != null && isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) {
|
||||
GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant));
|
||||
listHetGenotypes.add(grb);
|
||||
if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc);
|
||||
|
|
@ -571,7 +557,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
listHetGenotypes = removeExtraneousSites(listHetGenotypes);
|
||||
|
||||
// In any case, must still trim the window size to be "feasible"
|
||||
// [**NOTE**: May want to do this to try maximize the preservation of paths from (phasingSiteIndex - 1) to phasingSiteIndex]:
|
||||
// [**NOTE**: May want to do this to try maximize the preservation of paths from phaseRelativeToIndex to phasingSiteIndex]:
|
||||
if (listHetGenotypes.size() > maxPhaseSites) {
|
||||
listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus);
|
||||
|
||||
|
|
@ -585,8 +571,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
buildReadsAtHetSites(listHetGenotypes, onlyKeepReads);
|
||||
|
||||
// Copy to a fixed-size array:
|
||||
if (DEBUG)
|
||||
logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
if (DEBUG) logger.debug("FINAL phasing window of " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
hetGenotypes = new Genotype[listHetGenotypes.size()];
|
||||
int index = 0;
|
||||
for (GenotypeAndReadBases copyGrb : listHetGenotypes)
|
||||
|
|
@ -713,7 +698,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
if (DEBUG) logger.debug("Read graph:\n" + readGraph);
|
||||
Set<String> keepReads = new HashSet<String>();
|
||||
|
||||
/* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex):
|
||||
/* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex):
|
||||
|
||||
In detail:
|
||||
Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph".
|
||||
|
|
@ -759,7 +744,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R.
|
||||
*/
|
||||
int prev = phasingSiteIndex - 1;
|
||||
int prev = phaseRelativeToIndex;
|
||||
int cur = phasingSiteIndex;
|
||||
|
||||
if (!readGraph.getConnectedComponents().inSameSet(prev, cur)) { // There is NO path between cur and prev
|
||||
|
|
@ -842,53 +827,114 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
// Remove all sites that have no read bases:
|
||||
List<GenotypeAndReadBases> keepHetSites = new LinkedList<GenotypeAndReadBases>();
|
||||
int index = 0;
|
||||
int numPrecedingRemoved = 0;
|
||||
int numPrecedingPhaseRelativeToSiteRemoved = 0;
|
||||
int numPrecedingPhasingSiteRemoved = 0;
|
||||
for (GenotypeAndReadBases grb : listHetGenotypes) {
|
||||
boolean keepSite = sitesWithReads.contains(index);
|
||||
if (DEBUG && logger.isDebugEnabled() && !keepSite)
|
||||
logger.debug("Removing read-less site " + grb.loc);
|
||||
|
||||
if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) {
|
||||
if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) {
|
||||
keepHetSites.add(grb);
|
||||
if (!keepSite)
|
||||
if (DEBUG)
|
||||
logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]...");
|
||||
}
|
||||
else if (index <= phasingSiteIndex)
|
||||
numPrecedingRemoved++;
|
||||
else {
|
||||
if (index <= phaseRelativeToIndex)
|
||||
numPrecedingPhaseRelativeToSiteRemoved++;
|
||||
if (index <= phasingSiteIndex)
|
||||
numPrecedingPhasingSiteRemoved++;
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
phasingSiteIndex -= numPrecedingRemoved;
|
||||
phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved;
|
||||
phasingSiteIndex -= numPrecedingPhasingSiteRemoved;
|
||||
return keepHetSites;
|
||||
}
|
||||
|
||||
private class SortSitesBySumOfDist implements Comparator<Integer> {
|
||||
private Vector<GenotypeAndReadBases> grb;
|
||||
|
||||
public SortSitesBySumOfDist(List<GenotypeAndReadBases> listHetGenotypes) {
|
||||
grb = new Vector<GenotypeAndReadBases>(listHetGenotypes);
|
||||
}
|
||||
|
||||
public int compare(Integer i1, Integer i2) {
|
||||
int d1 = calcGenomicDist(i1);
|
||||
int d2 = calcGenomicDist(i2);
|
||||
|
||||
if (d1 != d2)
|
||||
return d1 - d2;
|
||||
|
||||
int id1 = calcIndexDist(i1);
|
||||
int id2 = calcIndexDist(i2);
|
||||
if (id1 != id2)
|
||||
return id1 - id2;
|
||||
|
||||
return i1 - i2;
|
||||
}
|
||||
|
||||
private int calcGenomicDist(int i) {
|
||||
int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc);
|
||||
int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc);
|
||||
|
||||
return d1 + d2;
|
||||
}
|
||||
|
||||
private int calcIndexDist(int i) {
|
||||
int d1 = Math.abs(i - phaseRelativeToIndex);
|
||||
int d2 = Math.abs(i - phasingSiteIndex);
|
||||
|
||||
return d1 + d2;
|
||||
}
|
||||
}
|
||||
|
||||
private List<GenotypeAndReadBases> trimWindow(List<GenotypeAndReadBases> listHetGenotypes, String sample, GenomeLoc phaseLocus) {
|
||||
if (DEBUG)
|
||||
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
|
||||
|
||||
int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes
|
||||
int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites!
|
||||
|
||||
int numOnLeft = prevSiteIndex;
|
||||
int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1);
|
||||
|
||||
int useOnLeft, useOnRight;
|
||||
if (numOnLeft <= numOnRight) {
|
||||
int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side
|
||||
useOnLeft = Math.min(halfToUse, numOnLeft);
|
||||
useOnRight = Math.min(numToUse - useOnLeft, numOnRight);
|
||||
Set<Integer> scoreAllIndices = new TreeSet<Integer>(new SortSitesBySumOfDist(listHetGenotypes));
|
||||
for (int i = 0; i < listHetGenotypes.size(); ++i) {
|
||||
if (i != phaseRelativeToIndex && i != phasingSiteIndex)
|
||||
scoreAllIndices.add(i);
|
||||
}
|
||||
else { // numOnRight < numOnLeft
|
||||
int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil]
|
||||
useOnRight = Math.min(halfToUse, numOnRight);
|
||||
useOnLeft = Math.min(numToUse - useOnRight, numOnLeft);
|
||||
|
||||
Set<Integer> keepIndices = new TreeSet<Integer>();
|
||||
// always keep these two indices:
|
||||
keepIndices.add(phaseRelativeToIndex);
|
||||
keepIndices.add(phasingSiteIndex);
|
||||
for (int addInd : scoreAllIndices) {
|
||||
if (keepIndices.size() >= maxPhaseSites)
|
||||
break;
|
||||
else // keepIndices.size() < maxPhaseSites
|
||||
keepIndices.add(addInd);
|
||||
}
|
||||
int startIndex = prevSiteIndex - useOnLeft;
|
||||
int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep
|
||||
phasingSiteIndex -= startIndex;
|
||||
listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex);
|
||||
|
||||
List<GenotypeAndReadBases> newListHetGenotypes = new LinkedList<GenotypeAndReadBases>();
|
||||
int newPhaseRelativeToIndex = -1;
|
||||
int newPhasingSiteIndex = -1;
|
||||
int oldIndex = 0;
|
||||
int newIndex = 0;
|
||||
for (GenotypeAndReadBases grb : listHetGenotypes) {
|
||||
if (keepIndices.contains(oldIndex)) {
|
||||
newListHetGenotypes.add(grb);
|
||||
|
||||
if (oldIndex == phaseRelativeToIndex)
|
||||
newPhaseRelativeToIndex = newIndex;
|
||||
if (oldIndex == phasingSiteIndex)
|
||||
newPhasingSiteIndex = newIndex;
|
||||
|
||||
++newIndex;
|
||||
}
|
||||
++oldIndex;
|
||||
}
|
||||
|
||||
phaseRelativeToIndex = newPhaseRelativeToIndex;
|
||||
phasingSiteIndex = newPhasingSiteIndex;
|
||||
listHetGenotypes = newListHetGenotypes;
|
||||
if (DEBUG)
|
||||
logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes));
|
||||
|
||||
|
|
@ -900,7 +946,8 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
/* Will map a phase and its "complement" to a single representative phase,
|
||||
and marginalizeAsNewTable() marginalizes to 2 positions [starting at the previous position, and then the current position]:
|
||||
*/
|
||||
HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, phaseWindow.phasingSiteIndex - 1, 2);
|
||||
int[] marginalizeInds = {phaseWindow.phaseRelativeToIndex, phaseWindow.phasingSiteIndex};
|
||||
HaplotypeTableCreator tabCreator = new TableCreatorOfHaplotypeAndComplementForDiploidAlleles(phaseWindow.hetGenotypes, marginalizeInds);
|
||||
PhasingTable sampleHaps = tabCreator.getNewTable();
|
||||
|
||||
if (DEBUG && logger.isDebugEnabled()) {
|
||||
|
|
@ -1015,17 +1062,27 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
Ensure that curAllelePair is phased relative to prevAllelePair as specified by hap.
|
||||
*/
|
||||
|
||||
public static void ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, Haplotype hap) {
|
||||
public static String[] ensurePhasing(SNPallelePair curAllelePair, SNPallelePair prevAllelePair, String[] prevPairNames, Haplotype hap) {
|
||||
if (hap.size() < 2)
|
||||
throw new ReviewedStingException("LOGICAL ERROR: Only considering haplotypes of length > 2!");
|
||||
|
||||
String[] curPairNames = prevPairNames;
|
||||
|
||||
byte prevBase = hap.getBase(0); // The 1st base in the haplotype
|
||||
byte curBase = hap.getBase(1); // The 2nd base in the haplotype
|
||||
|
||||
boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase);
|
||||
boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase);
|
||||
if (chosePrevTopChrom != choseCurTopChrom)
|
||||
curAllelePair.swapAlleles();
|
||||
if (chosePrevTopChrom != choseCurTopChrom) {
|
||||
//curAllelePair.swapAlleles();
|
||||
|
||||
/* Instead of swapping the alleles (as we used to above),
|
||||
we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype:
|
||||
*/
|
||||
curPairNames = new String[]{prevPairNames[1], prevPairNames[0]};
|
||||
}
|
||||
|
||||
return curPairNames;
|
||||
}
|
||||
|
||||
private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) {
|
||||
|
|
@ -1036,8 +1093,8 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
return loc1.distance(loc2) <= cacheWindow; // distance() checks: loc1.onSameContig(loc2)
|
||||
}
|
||||
|
||||
private int startDistance(UnfinishedVariantContext uvc1, VariantContext vc2) {
|
||||
return uvc1.getLocation().distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
|
||||
private int startDistance(GenomeLoc gl1, VariantContext vc2) {
|
||||
return gl1.distance(GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc2));
|
||||
}
|
||||
|
||||
public PhasingStats reduce(PhasingStatsAndOutput statsAndList, PhasingStats stats) {
|
||||
|
|
@ -1287,21 +1344,23 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
|
||||
private static class TableCreatorOfHaplotypeAndComplementForDiploidAlleles extends HaplotypeTableCreator {
|
||||
private SNPallelePair[] SNPallelePairs;
|
||||
private int startIndex;
|
||||
private int marginalizeLength;
|
||||
Set<Integer> marginalizeInds;
|
||||
|
||||
public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) {
|
||||
public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) {
|
||||
super(hetGenotypes);
|
||||
|
||||
this.SNPallelePairs = new SNPallelePair[genotypes.length];
|
||||
for (int i = 0; i < genotypes.length; i++)
|
||||
SNPallelePairs[i] = new SNPallelePair(genotypes[i]);
|
||||
|
||||
this.startIndex = startIndex;
|
||||
this.marginalizeLength = marginalizeLength;
|
||||
this.marginalizeInds = new TreeSet<Integer>();
|
||||
for (int mind : marginalizeInds)
|
||||
this.marginalizeInds.add(mind);
|
||||
}
|
||||
|
||||
public PhasingTable getNewTable() {
|
||||
int startIndex = marginalizeInds.iterator().next();
|
||||
|
||||
PhasingTable table = new PhasingTable();
|
||||
for (Haplotype hap : getAllHaplotypes()) {
|
||||
if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) {
|
||||
|
|
@ -1313,8 +1372,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
|
|||
hapList.add(hap);
|
||||
hapList.add(complement(hap));
|
||||
|
||||
// want marginalizeLength positions starting at startIndex:
|
||||
Haplotype rep = hap.subHaplotype(startIndex, startIndex + marginalizeLength);
|
||||
Haplotype rep = hap.subHaplotype(marginalizeInds);
|
||||
double hapClassPrior = getHaplotypeRepresentativePrior(rep); // Note that prior is ONLY a function of the representative haplotype
|
||||
|
||||
HaplotypeClass hapClass = new HaplotypeClass(hapList, rep);
|
||||
|
|
|
|||
Loading…
Reference in New Issue