diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java index 61d5a725e..9e7838647 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; class Haplotype extends BaseArray implements Cloneable { public Haplotype(byte[] bases) { @@ -68,4 +71,19 @@ class Haplotype extends BaseArray implements Cloneable { public Haplotype subHaplotype(int fromIndex, int toIndex) { return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); } + + public Haplotype subHaplotype(Set inds) { + List basesList = new LinkedList(); + for (int i : inds) { + if (0 <= i && i < bases.length) + basesList.add(bases[i]); + } + + Byte[] newBases = new Byte[basesList.size()]; + int index = 0; + for (Byte b : basesList) + newBases[index++] = b; + + return new Haplotype(newBases); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java old mode 100755 new mode 100644 index 68eab9889..ef84ed10b --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -137,6 +137,7 @@ public class ReadBackedPhasing extends RodWalker prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; - /* Notes: - 1. Call to next() advances iterator to next position in partiallyPhasedSites. - 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads. - */ - UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype prevHetGenotype = prevUvc.getGenotype(samp); + // Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites): + List prevHetGenotypes = new LinkedList(); + CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); + while (phasedIt.hasNext()) { + UnfinishedVariantAndReads phasedVr = phasedIt.next(); + Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp); + if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation()); + prevHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); + } + } + + SNPallelePair allelePair = new SNPallelePair(gt); + if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair); + + boolean phasedCurGenotypeRelativeToPrevious = false; + for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) { + PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets); PhaseResult pr = phaseSampleAtSite(phaseWindow); - boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality); + phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality); if (pr.phasingContainsInconsistencies) { if (DEBUG) @@ -406,44 +412,43 @@ public class ReadBackedPhasing extends RodWalker prevHetAndInteriorIt = null; + + private int phaseRelativeToIndex = -1; private int phasingSiteIndex = -1; + private Map readsAtHetSites = null; - private void clearFields() { - hetGenotypes = null; - prevHetAndInteriorIt = null; - phasingSiteIndex = -1; - readsAtHetSites = null; - } - - public boolean hasPreviousHets() { - return phasingSiteIndex > 0; + public Genotype phaseRelativeToGenotype() { + return hetGenotypes[phaseRelativeToIndex]; } // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)] - public PhasingWindow(VariantAndReads vr, String sample) { - List listHetGenotypes = new LinkedList(); + public PhasingWindow(VariantAndReads vr, String sample, List prevHetGenotypes, int goBackFromEndOfPrevHets) { + if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase + throw new ReviewedStingException("Should never get empty set of previous sites to phase against"); - // Include previously phased sites in the phasing computation: - CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); - while (phasedIt.hasNext()) { - UnfinishedVariantAndReads phasedVr = phasedIt.next(); - Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break" - listHetGenotypes.clear(); // clear out any history - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); - prevHetAndInteriorIt = phasedIt.clone(); - } - } + // Include these previously phased sites in the phasing computation: + List listHetGenotypes = new LinkedList(prevHetGenotypes); + + phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets; phasingSiteIndex = listHetGenotypes.size(); - if (phasingSiteIndex == 0) { // no previous sites against which to phase - clearFields(); - return; - } - prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] - if (respectPhaseInInput) { - Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample); - if (!prevHetGenotype.isPhased()) { - // Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]: - clearFields(); - return; - } - } - - // Add the (het) position to be phased: + // Add the (het) position to be phased [at phasingSiteIndex]: GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); listHetGenotypes.add(grbPhase); - if (DEBUG) - logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); + if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); // Include as-of-yet unphased sites in the phasing computation: for (VariantAndReads nextVr : unphasedSiteQueue) { if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc break; Genotype gt = nextVr.variant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break" - break; - } - else if (gt.isHet()) { + if (gt != null && isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) { GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); listHetGenotypes.add(grb); if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); @@ -571,7 +557,7 @@ public class ReadBackedPhasing extends RodWalker maxPhaseSites) { listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); @@ -585,8 +571,7 @@ public class ReadBackedPhasing extends RodWalker keepReads = new HashSet(); - /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex): + /* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex): In detail: Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph". @@ -759,7 +744,7 @@ public class ReadBackedPhasing extends RodWalker keepHetSites = new LinkedList(); int index = 0; - int numPrecedingRemoved = 0; + int numPrecedingPhaseRelativeToSiteRemoved = 0; + int numPrecedingPhasingSiteRemoved = 0; for (GenotypeAndReadBases grb : listHetGenotypes) { boolean keepSite = sitesWithReads.contains(index); if (DEBUG && logger.isDebugEnabled() && !keepSite) logger.debug("Removing read-less site " + grb.loc); - if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) { + if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) { keepHetSites.add(grb); if (!keepSite) if (DEBUG) logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); } - else if (index <= phasingSiteIndex) - numPrecedingRemoved++; + else { + if (index <= phaseRelativeToIndex) + numPrecedingPhaseRelativeToSiteRemoved++; + if (index <= phasingSiteIndex) + numPrecedingPhasingSiteRemoved++; + } index++; } - phasingSiteIndex -= numPrecedingRemoved; + phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved; + phasingSiteIndex -= numPrecedingPhasingSiteRemoved; return keepHetSites; } + private class SortSitesBySumOfDist implements Comparator { + private Vector grb; + + public SortSitesBySumOfDist(List listHetGenotypes) { + grb = new Vector(listHetGenotypes); + } + + public int compare(Integer i1, Integer i2) { + int d1 = calcGenomicDist(i1); + int d2 = calcGenomicDist(i2); + + if (d1 != d2) + return d1 - d2; + + int id1 = calcIndexDist(i1); + int id2 = calcIndexDist(i2); + if (id1 != id2) + return id1 - id2; + + return i1 - i2; + } + + private int calcGenomicDist(int i) { + int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc); + int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc); + + return d1 + d2; + } + + private int calcIndexDist(int i) { + int d1 = Math.abs(i - phaseRelativeToIndex); + int d2 = Math.abs(i - phasingSiteIndex); + + return d1 + d2; + } + } + private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { if (DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); - int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes - int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! - - int numOnLeft = prevSiteIndex; - int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1); - - int useOnLeft, useOnRight; - if (numOnLeft <= numOnRight) { - int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side - useOnLeft = Math.min(halfToUse, numOnLeft); - useOnRight = Math.min(numToUse - useOnLeft, numOnRight); + Set scoreAllIndices = new TreeSet(new SortSitesBySumOfDist(listHetGenotypes)); + for (int i = 0; i < listHetGenotypes.size(); ++i) { + if (i != phaseRelativeToIndex && i != phasingSiteIndex) + scoreAllIndices.add(i); } - else { // numOnRight < numOnLeft - int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] - useOnRight = Math.min(halfToUse, numOnRight); - useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); + + Set keepIndices = new TreeSet(); + // always keep these two indices: + keepIndices.add(phaseRelativeToIndex); + keepIndices.add(phasingSiteIndex); + for (int addInd : scoreAllIndices) { + if (keepIndices.size() >= maxPhaseSites) + break; + else // keepIndices.size() < maxPhaseSites + keepIndices.add(addInd); } - int startIndex = prevSiteIndex - useOnLeft; - int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep - phasingSiteIndex -= startIndex; - listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); + + List newListHetGenotypes = new LinkedList(); + int newPhaseRelativeToIndex = -1; + int newPhasingSiteIndex = -1; + int oldIndex = 0; + int newIndex = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + if (keepIndices.contains(oldIndex)) { + newListHetGenotypes.add(grb); + + if (oldIndex == phaseRelativeToIndex) + newPhaseRelativeToIndex = newIndex; + if (oldIndex == phasingSiteIndex) + newPhasingSiteIndex = newIndex; + + ++newIndex; + } + ++oldIndex; + } + + phaseRelativeToIndex = newPhaseRelativeToIndex; + phasingSiteIndex = newPhasingSiteIndex; + listHetGenotypes = newListHetGenotypes; if (DEBUG) logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); @@ -900,7 +946,8 @@ public class ReadBackedPhasing extends RodWalker 2!"); + String[] curPairNames = prevPairNames; + byte prevBase = hap.getBase(0); // The 1st base in the haplotype byte curBase = hap.getBase(1); // The 2nd base in the haplotype boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); - if (chosePrevTopChrom != choseCurTopChrom) - curAllelePair.swapAlleles(); + if (chosePrevTopChrom != choseCurTopChrom) { + //curAllelePair.swapAlleles(); + + /* Instead of swapping the alleles (as we used to above), + we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype: + */ + curPairNames = new String[]{prevPairNames[1], prevPairNames[0]}; + } + + return curPairNames; } private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { @@ -1036,8 +1093,8 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds; - public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { + public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) { super(hetGenotypes); this.SNPallelePairs = new SNPallelePair[genotypes.length]; for (int i = 0; i < genotypes.length; i++) SNPallelePairs[i] = new SNPallelePair(genotypes[i]); - this.startIndex = startIndex; - this.marginalizeLength = marginalizeLength; + this.marginalizeInds = new TreeSet(); + for (int mind : marginalizeInds) + this.marginalizeInds.add(mind); } public PhasingTable getNewTable() { + int startIndex = marginalizeInds.iterator().next(); + PhasingTable table = new PhasingTable(); for (Haplotype hap : getAllHaplotypes()) { if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { @@ -1313,8 +1372,7 @@ public class ReadBackedPhasing extends RodWalker