From 6984ab0c78b29e57b1cf6c9cb68cafb58316279a Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Tue, 8 Jan 2013 16:40:32 -0500 Subject: [PATCH 1/7] Change ReadBackedPhasing to be VCF4.1 compliant (but instead of using PS tags, it uses HP tags for pairs of diploid haplotype labels). The important point is that it no longer uses the '|' character instead of '/' or flips around the allele order in the genotype. Instead, the HP FORMAT targ is used to mark the haplotype labels with respect to other genotypes for the same sample. Note that this enables the phasing of non-consecutive (but nearby) sites, based on mate pairs, for example. Also, updated the HaplotypeCallerValidation Qscript to perform PhaseByTransmission (if ped file given) and then ReadBackedPhasing --- .../sting/gatk/walkers/phasing/Haplotype.java | 18 + .../walkers/phasing/ReadBackedPhasing.java | 332 ++++++++++-------- 2 files changed, 213 insertions(+), 137 deletions(-) mode change 100755 => 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java index 61d5a725e..9e7838647 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/Haplotype.java @@ -26,6 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; class Haplotype extends BaseArray implements Cloneable { public Haplotype(byte[] bases) { @@ -68,4 +71,19 @@ class Haplotype extends BaseArray implements Cloneable { public Haplotype subHaplotype(int fromIndex, int toIndex) { return new Haplotype(Arrays.copyOfRange(bases, fromIndex, Math.min(toIndex, size()))); } + + public Haplotype subHaplotype(Set inds) { + List basesList = new LinkedList(); + for (int i : inds) { + if (0 <= i && i < bases.length) + basesList.add(bases[i]); + } + + Byte[] newBases = new Byte[basesList.size()]; + int index = 0; + for (Byte b : basesList) + newBases[index++] = b; + + return new Haplotype(newBases); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java old mode 100755 new mode 100644 index 68eab9889..ef84ed10b --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -137,6 +137,7 @@ public class ReadBackedPhasing extends RodWalker prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; - /* Notes: - 1. Call to next() advances iterator to next position in partiallyPhasedSites. - 2. prevHetGenotype != null, since otherwise prevHetAndInteriorIt would not have been chosen to point to its UnfinishedVariantAndReads. - */ - UnfinishedVariantContext prevUvc = prevHetAndInteriorIt.next().unfinishedVariant; - Genotype prevHetGenotype = prevUvc.getGenotype(samp); + // Create the list of all het genotypes preceding this one (and in the phasing window as contained in partiallyPhasedSites): + List prevHetGenotypes = new LinkedList(); + CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); + while (phasedIt.hasNext()) { + UnfinishedVariantAndReads phasedVr = phasedIt.next(); + Genotype prevGt = phasedVr.unfinishedVariant.getGenotype(samp); + if (prevGt != null && isUnfilteredCalledDiploidGenotype(prevGt) && prevGt.isHet()) { + GenotypeAndReadBases grb = new GenotypeAndReadBases(prevGt, phasedVr.sampleReadBases.get(samp), phasedVr.unfinishedVariant.getLocation()); + prevHetGenotypes.add(grb); + if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); + } + } + + SNPallelePair allelePair = new SNPallelePair(gt); + if (DEBUG) logger.debug("Want to phase TOP vs. BOTTOM for: " + "\n" + allelePair); + + boolean phasedCurGenotypeRelativeToPrevious = false; + for (int goBackFromEndOfPrevHets = 0; goBackFromEndOfPrevHets < prevHetGenotypes.size(); goBackFromEndOfPrevHets++) { + PhasingWindow phaseWindow = new PhasingWindow(vr, samp, prevHetGenotypes, goBackFromEndOfPrevHets); PhaseResult pr = phaseSampleAtSite(phaseWindow); - boolean genotypesArePhased = passesPhasingThreshold(pr.phaseQuality); + phasedCurGenotypeRelativeToPrevious = passesPhasingThreshold(pr.phaseQuality); if (pr.phasingContainsInconsistencies) { if (DEBUG) @@ -406,44 +412,43 @@ public class ReadBackedPhasing extends RodWalker prevHetAndInteriorIt = null; + + private int phaseRelativeToIndex = -1; private int phasingSiteIndex = -1; + private Map readsAtHetSites = null; - private void clearFields() { - hetGenotypes = null; - prevHetAndInteriorIt = null; - phasingSiteIndex = -1; - readsAtHetSites = null; - } - - public boolean hasPreviousHets() { - return phasingSiteIndex > 0; + public Genotype phaseRelativeToGenotype() { + return hetGenotypes[phaseRelativeToIndex]; } // ASSUMES that: isUnfilteredCalledDiploidGenotype(vrGt) && vrGt.isHet() [vrGt = vr.variant.getGenotype(sample)] - public PhasingWindow(VariantAndReads vr, String sample) { - List listHetGenotypes = new LinkedList(); + public PhasingWindow(VariantAndReads vr, String sample, List prevHetGenotypes, int goBackFromEndOfPrevHets) { + if (prevHetGenotypes.isEmpty() || goBackFromEndOfPrevHets >= prevHetGenotypes.size()) // no previous sites against which to phase + throw new ReviewedStingException("Should never get empty set of previous sites to phase against"); - // Include previously phased sites in the phasing computation: - CloneableIteratorLinkedList.CloneableIterator phasedIt = partiallyPhasedSites.iterator(); - while (phasedIt.hasNext()) { - UnfinishedVariantAndReads phasedVr = phasedIt.next(); - Genotype gt = phasedVr.unfinishedVariant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must start AFTER this "break" - listHetGenotypes.clear(); // clear out any history - } - else if (gt.isHet()) { - GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, phasedVr.sampleReadBases.get(sample), phasedVr.unfinishedVariant.getLocation()); - listHetGenotypes.add(grb); - if (DEBUG) logger.debug("Using UPSTREAM het site = " + grb.loc); - prevHetAndInteriorIt = phasedIt.clone(); - } - } + // Include these previously phased sites in the phasing computation: + List listHetGenotypes = new LinkedList(prevHetGenotypes); + + phaseRelativeToIndex = listHetGenotypes.size() - 1 - goBackFromEndOfPrevHets; phasingSiteIndex = listHetGenotypes.size(); - if (phasingSiteIndex == 0) { // no previous sites against which to phase - clearFields(); - return; - } - prevHetAndInteriorIt.previous(); // so that it points to the previous het site [and NOT one after it, due to the last call to next()] - if (respectPhaseInInput) { - Genotype prevHetGenotype = prevHetAndInteriorIt.clone().next().unfinishedVariant.getGenotype(sample); - if (!prevHetGenotype.isPhased()) { - // Make this genotype unphaseable, since its previous het is not already phased [as required by respectPhaseInInput]: - clearFields(); - return; - } - } - - // Add the (het) position to be phased: + // Add the (het) position to be phased [at phasingSiteIndex]: GenomeLoc phaseLocus = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vr.variant); GenotypeAndReadBases grbPhase = new GenotypeAndReadBases(vr.variant.getGenotype(sample), vr.sampleReadBases.get(sample), phaseLocus); listHetGenotypes.add(grbPhase); - if (DEBUG) - logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); + if (DEBUG) logger.debug("PHASING het site = " + grbPhase.loc + " [phasingSiteIndex = " + phasingSiteIndex + "]"); // Include as-of-yet unphased sites in the phasing computation: for (VariantAndReads nextVr : unphasedSiteQueue) { if (!startDistancesAreInWindowRange(vr.variant, nextVr.variant)) //nextVr too far ahead of the range used for phasing vc break; Genotype gt = nextVr.variant.getGenotype(sample); - if (gt == null || !isUnfilteredCalledDiploidGenotype(gt)) { // constructed haplotype must end BEFORE this "break" - break; - } - else if (gt.isHet()) { + if (gt != null && isUnfilteredCalledDiploidGenotype(gt) && gt.isHet()) { GenotypeAndReadBases grb = new GenotypeAndReadBases(gt, nextVr.sampleReadBases.get(sample), GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), nextVr.variant)); listHetGenotypes.add(grb); if (DEBUG) logger.debug("Using DOWNSTREAM het site = " + grb.loc); @@ -571,7 +557,7 @@ public class ReadBackedPhasing extends RodWalker maxPhaseSites) { listHetGenotypes = trimWindow(listHetGenotypes, sample, phaseLocus); @@ -585,8 +571,7 @@ public class ReadBackedPhasing extends RodWalker keepReads = new HashSet(); - /* Check which Reads are involved in acyclic paths from (phasingSiteIndex - 1) to (phasingSiteIndex): + /* Check which Reads are involved in acyclic paths from phaseRelativeToIndex to (phasingSiteIndex): In detail: Every Read links EACH pair of sites for which it contains bases. Then, each such edge is added to a "site connectivity graph". @@ -759,7 +744,7 @@ public class ReadBackedPhasing extends RodWalker keepHetSites = new LinkedList(); int index = 0; - int numPrecedingRemoved = 0; + int numPrecedingPhaseRelativeToSiteRemoved = 0; + int numPrecedingPhasingSiteRemoved = 0; for (GenotypeAndReadBases grb : listHetGenotypes) { boolean keepSite = sitesWithReads.contains(index); if (DEBUG && logger.isDebugEnabled() && !keepSite) logger.debug("Removing read-less site " + grb.loc); - if (keepSite || index == phasingSiteIndex || index == phasingSiteIndex - 1) { + if (keepSite || index == phasingSiteIndex || index == phaseRelativeToIndex) { keepHetSites.add(grb); if (!keepSite) if (DEBUG) logger.debug("Although current or previous sites have no relevant reads, continuing empty attempt to phase them [for sake of program flow]..."); } - else if (index <= phasingSiteIndex) - numPrecedingRemoved++; + else { + if (index <= phaseRelativeToIndex) + numPrecedingPhaseRelativeToSiteRemoved++; + if (index <= phasingSiteIndex) + numPrecedingPhasingSiteRemoved++; + } index++; } - phasingSiteIndex -= numPrecedingRemoved; + phaseRelativeToIndex -= numPrecedingPhaseRelativeToSiteRemoved; + phasingSiteIndex -= numPrecedingPhasingSiteRemoved; return keepHetSites; } + private class SortSitesBySumOfDist implements Comparator { + private Vector grb; + + public SortSitesBySumOfDist(List listHetGenotypes) { + grb = new Vector(listHetGenotypes); + } + + public int compare(Integer i1, Integer i2) { + int d1 = calcGenomicDist(i1); + int d2 = calcGenomicDist(i2); + + if (d1 != d2) + return d1 - d2; + + int id1 = calcIndexDist(i1); + int id2 = calcIndexDist(i2); + if (id1 != id2) + return id1 - id2; + + return i1 - i2; + } + + private int calcGenomicDist(int i) { + int d1 = grb.get(i).loc.distance(grb.get(phaseRelativeToIndex).loc); + int d2 = grb.get(i).loc.distance(grb.get(phasingSiteIndex).loc); + + return d1 + d2; + } + + private int calcIndexDist(int i) { + int d1 = Math.abs(i - phaseRelativeToIndex); + int d2 = Math.abs(i - phasingSiteIndex); + + return d1 + d2; + } + } + private List trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { if (DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); - int prevSiteIndex = phasingSiteIndex - 1; // index of previous in listHetGenotypes - int numToUse = maxPhaseSites - 2; // since always keep previous and current het sites! - - int numOnLeft = prevSiteIndex; - int numOnRight = listHetGenotypes.size() - (phasingSiteIndex + 1); - - int useOnLeft, useOnRight; - if (numOnLeft <= numOnRight) { - int halfToUse = numToUse / 2; // skimp on the left [floor], and be generous with the right side - useOnLeft = Math.min(halfToUse, numOnLeft); - useOnRight = Math.min(numToUse - useOnLeft, numOnRight); + Set scoreAllIndices = new TreeSet(new SortSitesBySumOfDist(listHetGenotypes)); + for (int i = 0; i < listHetGenotypes.size(); ++i) { + if (i != phaseRelativeToIndex && i != phasingSiteIndex) + scoreAllIndices.add(i); } - else { // numOnRight < numOnLeft - int halfToUse = new Double(Math.ceil(numToUse / 2.0)).intValue(); // be generous with the right side [ceil] - useOnRight = Math.min(halfToUse, numOnRight); - useOnLeft = Math.min(numToUse - useOnRight, numOnLeft); + + Set keepIndices = new TreeSet(); + // always keep these two indices: + keepIndices.add(phaseRelativeToIndex); + keepIndices.add(phasingSiteIndex); + for (int addInd : scoreAllIndices) { + if (keepIndices.size() >= maxPhaseSites) + break; + else // keepIndices.size() < maxPhaseSites + keepIndices.add(addInd); } - int startIndex = prevSiteIndex - useOnLeft; - int stopIndex = phasingSiteIndex + useOnRight + 1; // put the index 1 past the desired index to keep - phasingSiteIndex -= startIndex; - listHetGenotypes = listHetGenotypes.subList(startIndex, stopIndex); + + List newListHetGenotypes = new LinkedList(); + int newPhaseRelativeToIndex = -1; + int newPhasingSiteIndex = -1; + int oldIndex = 0; + int newIndex = 0; + for (GenotypeAndReadBases grb : listHetGenotypes) { + if (keepIndices.contains(oldIndex)) { + newListHetGenotypes.add(grb); + + if (oldIndex == phaseRelativeToIndex) + newPhaseRelativeToIndex = newIndex; + if (oldIndex == phasingSiteIndex) + newPhasingSiteIndex = newIndex; + + ++newIndex; + } + ++oldIndex; + } + + phaseRelativeToIndex = newPhaseRelativeToIndex; + phasingSiteIndex = newPhasingSiteIndex; + listHetGenotypes = newListHetGenotypes; if (DEBUG) logger.warn("NAIVELY REDUCED to " + listHetGenotypes.size() + " sites:\n" + toStringGRL(listHetGenotypes)); @@ -900,7 +946,8 @@ public class ReadBackedPhasing extends RodWalker 2!"); + String[] curPairNames = prevPairNames; + byte prevBase = hap.getBase(0); // The 1st base in the haplotype byte curBase = hap.getBase(1); // The 2nd base in the haplotype boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); - if (chosePrevTopChrom != choseCurTopChrom) - curAllelePair.swapAlleles(); + if (chosePrevTopChrom != choseCurTopChrom) { + //curAllelePair.swapAlleles(); + + /* Instead of swapping the alleles (as we used to above), + we swap the haplotype names to fit the unswapped alleles as they are ordered in the Genotype: + */ + curPairNames = new String[]{prevPairNames[1], prevPairNames[0]}; + } + + return curPairNames; } private boolean startDistancesAreInWindowRange(VariantContext vc1, VariantContext vc2) { @@ -1036,8 +1093,8 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds; - public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int startIndex, int marginalizeLength) { + public TableCreatorOfHaplotypeAndComplementForDiploidAlleles(Genotype[] hetGenotypes, int[] marginalizeInds) { super(hetGenotypes); this.SNPallelePairs = new SNPallelePair[genotypes.length]; for (int i = 0; i < genotypes.length; i++) SNPallelePairs[i] = new SNPallelePair(genotypes[i]); - this.startIndex = startIndex; - this.marginalizeLength = marginalizeLength; + this.marginalizeInds = new TreeSet(); + for (int mind : marginalizeInds) + this.marginalizeInds.add(mind); } public PhasingTable getNewTable() { + int startIndex = marginalizeInds.iterator().next(); + PhasingTable table = new PhasingTable(); for (Haplotype hap : getAllHaplotypes()) { if (SNPallelePairs[startIndex].matchesTopBase(hap.getBase(startIndex))) { @@ -1313,8 +1372,7 @@ public class ReadBackedPhasing extends RodWalker Date: Wed, 9 Jan 2013 14:44:22 -0500 Subject: [PATCH 2/7] Added more options to control RBP behavior; raised default RBP PQ threshold to 20 --- .../sting/gatk/walkers/phasing/ReadBackedPhasing.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index ef84ed10b..d0b5fc5bd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -117,7 +117,7 @@ public class ReadBackedPhasing extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 + protected Double phaseQualityThresh = 20.0; // PQ = 20.0 <=> P(error) = 10^(-20/10) = 0.01, P(correct) = 0.99 @Hidden @Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files [For DEBUGGING purposes only - DO NOT USE!]", required = false) From 5577715ae25c7b668366daefbdb99b53626f4cd0 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 6 Mar 2013 10:18:23 -0500 Subject: [PATCH 3/7] Have all significant XHMM commands be run with LongRunTime --- .../sting/queue/qscripts/CNV/xhmmCNVpipeline.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index 7dd771873..9bb031c38 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -196,7 +196,7 @@ class xhmmCNVpipeline extends QScript { } addAll(docs) - val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit with LongRunTime add(mergeDepths) var excludeTargets : List[File] = List[File]() @@ -305,7 +305,7 @@ class xhmmCNVpipeline extends QScript { } } - class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { @Input(doc = "") val input = inputParam @@ -335,7 +335,7 @@ class xhmmCNVpipeline extends QScript { override def description = "Filters samples and targets and then mean-centers the targets: " + command } - class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { @Input(doc = "") val input = inputParam @@ -358,7 +358,7 @@ class xhmmCNVpipeline extends QScript { override def description = "Runs PCA on mean-centered data: " + command } - class Normalize(pca: PCA) extends CommandLineFunction { + class Normalize(pca: PCA) extends CommandLineFunction with LongRunTime { @Input(doc = "") val input = pca.input @@ -387,7 +387,7 @@ class xhmmCNVpipeline extends QScript { override def description = "Normalizes mean-centered data using PCA information: " + command } - class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { @Input(doc = "") val input = inputParam @@ -413,7 +413,7 @@ class xhmmCNVpipeline extends QScript { override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command } - class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit with LongRunTime { @Input(doc = "") val input = inputParam From 13240588cf70052c7139fa3e30b54d7fd071286f Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Mon, 6 May 2013 13:52:14 -0400 Subject: [PATCH 5/7] Fix to only consider the samples that are both in the PED file and in the VCF file --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index a4c1caf86..87e9a6ea0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -450,9 +450,9 @@ public class PhaseByTransmission extends RodWalker, HashMa Set vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); //Get the trios from the families passed as ped - setTrios(); + setTrios(vcfSamples); if(trios.size()<1) - throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + throw new UserException.BadInput("No PED file passed or no *non-skipped* trios found in PED file. Aborted."); Set headerLines = new HashSet(); @@ -471,9 +471,9 @@ public class PhaseByTransmission extends RodWalker, HashMa /** * Select trios and parent/child pairs only */ - private void setTrios(){ + private void setTrios(Set vcfSamples){ - Map> families = this.getSampleDB().getFamilies(); + Map> families = this.getSampleDB().getFamilies(vcfSamples); Set family; ArrayList parents; for(Map.Entry> familyEntry : families.entrySet()){ From c7dcc2b53bba03a2024919b467316a60dadc9de9 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Mon, 6 May 2013 15:47:27 -0400 Subject: [PATCH 6/7] Fix to deal with multi-generational families being allowed if only one level (one 'trio', effectively) appears in the VCF --- .../gatk/walkers/phasing/PhaseByTransmission.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 87e9a6ea0..7bbc4e981 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -478,6 +478,16 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList parents; for(Map.Entry> familyEntry : families.entrySet()){ family = familyEntry.getValue(); + + // Since getFamilies(vcfSamples) above still returns parents of samples in the VCF even if those parents are not in the VCF, need to subset down here: + Set familyMembersInVCF = new TreeSet(); + for(Sample familyMember : family){ + if (vcfSamples.contains(familyMember.getID())) { + familyMembersInVCF.add(familyMember); + } + } + family = familyMembersInVCF; + if(family.size()<2 || family.size()>3){ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey(),family.size())); } @@ -488,8 +498,7 @@ public class PhaseByTransmission extends RodWalker, HashMa if(family.containsAll(parents)) this.trios.add(familyMember); else - logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyEntry.getKey())); - break; + logger.info(String.format("Caution: Child %s of family %s skipped as info is not provided as a complete trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Child skipped.", familyMember.getID(), familyEntry.getKey())); } } } From e33d3dafc6554c4fea705e62dabb93115d457240 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Fri, 3 Jan 2014 12:04:47 -0500 Subject: [PATCH 7/7] Add documentation for RBP, and also update the MD5 for the tests now that the output uses HP tags instead of '|', which is now reserved for trio-based phasing --- .../walkers/phasing/ReadBackedPhasing.java | 47 +++++++++++++++---- .../ReadBackedPhasingIntegrationTest.java | 26 ++++------ 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index a28fc83b5..7ed77b845 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -82,6 +82,12 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). * + * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + * + * The underlying algorithm is based on building up 2^n local haplotypes, + * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). + * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + * *

* Performs physical phasing of SNP calls, based on sequencing reads. *

@@ -161,13 +167,6 @@ public class ReadBackedPhasing extends RodWalker unphasedSiteQueue = null; @@ -327,6 +326,7 @@ public class ReadBackedPhasing extends RodWalker processQueue(PhasingStats phaseStats, boolean processAll) { List oldPhasedList = new LinkedList(); @@ -362,6 +362,7 @@ public class ReadBackedPhasing extends RodWalker discardIrrelevantPhasedSites() { List vcList = new LinkedList(); @@ -517,6 +518,7 @@ public class ReadBackedPhasing extends RodWalker= phaseQualityThresh; } + // A genotype and the base pileup that supports it private static class GenotypeAndReadBases { public Genotype genotype; public ReadBasesAtPosition readBases; @@ -529,6 +531,7 @@ public class ReadBackedPhasing extends RodWalker listHetGenotypes, String sample, GenomeLoc phasingLoc) { buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); } @@ -650,6 +654,7 @@ public class ReadBackedPhasing extends RodWalker> edgeReads; @@ -695,6 +700,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousReads(int numHetSites) { PhasingGraph readGraph = new PhasingGraph(numHetSites); EdgeToReads edgeToReads = new EdgeToReads(); @@ -840,6 +846,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousSites(List listHetGenotypes) { Set sitesWithReads = new HashSet(); for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { @@ -879,6 +886,10 @@ public class ReadBackedPhasing extends RodWalker { private Vector grb; @@ -916,6 +927,7 @@ public class ReadBackedPhasing extends RodWalker trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { if (DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); @@ -966,6 +978,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases; @@ -1214,6 +1229,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases; @@ -1313,6 +1329,9 @@ public class ReadBackedPhasing extends RodWalker hapMap = new TreeMap(); for (PhasingTable.PhasingTableEntry pte : table) { @@ -1366,6 +1388,7 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds; @@ -1412,6 +1435,9 @@ public class ReadBackedPhasing extends RodWalker { private LinkedList table; @@ -1464,6 +1491,7 @@ public class ReadBackedPhasing extends RodWalker { private HaplotypeClass haplotypeClass; private PhasingScore score; @@ -1528,6 +1557,7 @@ public class ReadBackedPhasing extends RodWalker multipleBaseCounts = null; @@ -1644,6 +1674,7 @@ class HaplotypeClass implements Iterable { } } +// Summary statistics about phasing rates, for each sample class PhasingStats { private int numReads; private int numVarSites; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 9759004a0..8c8817fe6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -72,7 +72,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:332341-382503", 1, - Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); + Arrays.asList("1bb034bd54421fe4884e3142ed92d47e")); executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); } @@ -82,7 +82,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:1232503-1332503", 1, - Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); + Arrays.asList("c12954252d4c8659b5ecf7517b277496")); executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); } @@ -92,7 +92,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + " -L chr20:332341-382503", 1, - Arrays.asList("f685803333123a102ce1851d984cbd10")); + Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5")); executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); } @@ -102,7 +102,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + " -L chr20:332341-382503", 1, - Arrays.asList("aaa7c25d118383639f273128d241e140")); + Arrays.asList("e9e8ef92d694ca71f29737fba26282f5")); executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); } @@ -112,7 +112,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + " -L chr20:332341-482503", 1, - Arrays.asList("418e29400762972e77bae4f73e16befe")); + Arrays.asList("b9c9347c760a06db635952bf4920fb48")); executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); } @@ -122,7 +122,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:652810-681757", 1, - Arrays.asList("4c8f6190ecc86766baba3aba08542991")); + Arrays.asList("02c3a903842aa035ae379f16bc3d64ae")); executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } @@ -132,18 +132,8 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + " -L chr20:332341-802503", 1, - Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34")); - executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec); - } - - @Test - public void test8() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503" + " -respectPhaseInInput", - 1, - Arrays.asList("e3549b89d49092e73cc6eb21f233471c")); - executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec); + Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); + executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); } }