From 798955b0060992af1d27520bae7b1e781014cc8c Mon Sep 17 00:00:00 2001 From: fromer Date: Tue, 1 Feb 2011 19:50:15 +0000 Subject: [PATCH] After discussing with Mark, revert to "Master merging" of phase information from VCFs. This has the advantage of creating minimal phased VCFs from RBP, from which phase info is merged into the original "master VCF". Also, updated Genotype.sameGenotype() to be simpler and NOT REVERSE the ignorePhase flag in comparing Allele lists/sets git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5167 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 72 ++++++++++++++++++- .../MendelianViolationEvaluator.java | 2 +- .../walkers/variantutils/CombineVariants.java | 15 ++-- ...ingToTrioPhasingNoRecombinationWalker.java | 5 +- ...{phaseSamples.scala => PhaseSamples.scala} | 4 +- 5 files changed, 88 insertions(+), 10 deletions(-) rename scala/qscript/oneoffs/fromer/{phaseSamples.scala => PhaseSamples.scala} (97%) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 4e70428f7..24e2a2cef 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -316,9 +316,79 @@ public class VariantContextUtils { } public enum VariantMergeType { - UNION, INTERSECT + UNION, INTERSECT, MASTER } + /** + * Performs a master merge on the VCs. Here there is a master input [contains all of the information] and many + * VCs containing partial, extra genotype information which should be added to the master. For example, + * we scatter out the phasing algorithm over some samples in the master, producing a minimal VCF with phasing + * information per genotype. The master merge will add the PQ information from each genotype record, where + * appropriate, to the master VC. + * + * @param unsortedVCs + * @param masterName + * @return + */ + public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { + VariantContext master = findMaster(unsortedVCs, masterName); + Map genotypes = master.getGenotypes(); + for (Genotype g : genotypes.values()) { + genotypes.put(g.getSampleName(), new MutableGenotype(g)); + } + + Map masterAttributes = new HashMap(master.getAttributes()); + + for (VariantContext vc : unsortedVCs) { + if (!vc.getSource().equals(masterName)) { + for (Genotype g : vc.getGenotypes().values()) { + MutableGenotype masterG = (MutableGenotype) genotypes.get(g.getSampleName()); + for (Map.Entry attr : g.getAttributes().entrySet()) { + if (!masterG.hasAttribute(attr.getKey())) { + //System.out.printf("Adding GT attribute %s to masterG %s, new %s%n", attr, masterG, g); + masterG.putAttribute(attr.getKey(), attr.getValue()); + } + } + + if (masterG.isPhased() != g.isPhased()) { + if (masterG.sameGenotype(g)) { + // System.out.printf("Updating phasing %s to masterG %s, new %s%n", g.isPhased(), masterG, g); + masterG.setAlleles(g.getAlleles()); + masterG.setPhase(g.isPhased()); + } + //else System.out.println("WARNING: Not updating phase, since genotypes differ between master file and auxiliary info file!"); + } + +// if ( MathUtils.compareDoubles(masterG.getNegLog10PError(), g.getNegLog10PError()) != 0 ) { +// System.out.printf("Updating GQ %s to masterG %s, new %s%n", g.getNegLog10PError(), masterG, g); +// masterG.setNegLog10PError(g.getNegLog10PError()); +// } + + } + + for (Map.Entry attr : vc.getAttributes().entrySet()) { + if (!masterAttributes.containsKey(attr.getKey())) { + //System.out.printf("Adding VC attribute %s to master %s, new %s%n", attr, master, vc); + masterAttributes.put(attr.getKey(), attr.getValue()); + } + } + } + } + + return new VariantContext(master.getSource(), master.getChr(), master.getStart(), master.getEnd(), master.getAlleles(), genotypes, master.getNegLog10PError(), master.getFilters(), masterAttributes); + } + + private static final VariantContext findMaster(Collection unsortedVCs, String masterName) { + for (VariantContext vc : unsortedVCs) { + if (vc.getSource().equals(masterName)) { + return vc; + } + } + + throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); + } + + public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, byte refBase) { return simpleMerge(genomeLocParser, unsortedVCs, null, VariantMergeType.INTERSECT, GenotypeMergeType.UNSORTED, false, false, refBase); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index 1eee21fe5..bf162d88d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -172,7 +172,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator { for (Allele dadAllele : dadA) { if (momAllele.isCalled() && dadAllele.isCalled()) { Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if (childG.sameGenotype(possibleChild, false)) { + if (childG.sameGenotype(possibleChild)) { return false; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index b306b915e..1a21a92e0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -120,24 +120,29 @@ public class CombineVariants extends RodWalker { } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) // RodWalkers can make funky map calls + if ( tracker == null ) // RodWalkers can make funky map calls return 0; // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); - VariantContext mergedVC = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, priority, variantMergeOption, - genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled); + VariantContext mergedVC = null; + if ( variantMergeOption == VariantContextUtils.VariantMergeType.MASTER ) { + mergedVC = VariantContextUtils.masterMerge(vcs, "master"); + } else { + mergedVC = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(),vcs, priority, variantMergeOption, + genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled); + } //out.printf(" merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC); - if (mergedVC != null) { // only operate at the start of events + if ( mergedVC != null ) { // only operate at the start of events HashMap attributes = new HashMap(mergedVC.getAttributes()); // re-compute chromosome counts VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); - if (minimalVCF) + if ( minimalVCF ) annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY))); vcfWriter.add(annotatedMergedVC, ref.getBase()); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java index fc1cac8e2..f10062185 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java @@ -168,7 +168,7 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< if (curTrioVc.getNSamples() > NUM_IN_TRIO || sampleCurGtInTrio == null) throw new UserException("Must provide trio data for sample: " + phasingSample); - if (!new TreeSet(curPhasingGt.getAlleles()).equals(new TreeSet(sampleCurGtInTrio.getAlleles()))) { + if (!curPhasingGt.sameGenotype(sampleCurGtInTrio)) { logger.warn("Locus " + curLoc + " breaks phase, since " + PHASING_ROD_NAME + " and " + TRIO_ROD_NAME + " tracks have different genotypes for " + phasingSample + "!"); prevLoc = null; return result; @@ -323,7 +323,8 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< stats.bothCanPhase++; useTrioPhase = false; - if (!phasedCurAlleles.equals(curPhasingGt.getAlleles())) { + boolean ignorePhase = false; + if (!phasedGt.sameGenotype(curPhasingGt, ignorePhase)) { String contradictMessage = "Phase from " + PHASING_ROD_NAME + " track at " + curLoc + " contradicts the trio-based phasing."; stats.contradictoryPhaseSites++; addToOutput += "\tcontradictory"; diff --git a/scala/qscript/oneoffs/fromer/phaseSamples.scala b/scala/qscript/oneoffs/fromer/PhaseSamples.scala similarity index 97% rename from scala/qscript/oneoffs/fromer/phaseSamples.scala rename to scala/qscript/oneoffs/fromer/PhaseSamples.scala index 4bad047d9..b06673819 100644 --- a/scala/qscript/oneoffs/fromer/phaseSamples.scala +++ b/scala/qscript/oneoffs/fromer/PhaseSamples.scala @@ -136,7 +136,9 @@ class CombineVariants(vcfsToCombine: List[File]) extends org.broadinstitute.stin this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) } - this.variantMergeOptions = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION) + // add the master call: + this.rodBind :+= RodBind("master", "VCF", masterCalls) + this.variantMergeOptions = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.MASTER) this.out = outputPhased }