From b4ef716aafedf2943ac07cb470b32d972246e252 Mon Sep 17 00:00:00 2001 From: fromer Date: Wed, 1 Dec 2010 16:42:08 +0000 Subject: [PATCH] As per Eric and Mark's suggestions, separated the segregating MNP merger (MergeMNPs) from the more general merger employed for annotation purposes (MergeSegregatingAlternateAlleles). Both use the same core MergePhasedSegregatingAlternateAllelesVCFWriter git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4763 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 78 ++++++++-- .../gatk/walkers/phasing/MergeMNPsWalker.java | 146 ++++++++++++++++++ ...SegregatingAlternateAllelesVCFWriter.java} | 119 +++++++++++--- ...ergeSegregatingAlternateAllelesWalker.java | 75 +++++---- .../phasing/ReadBackedPhasingWalker.java | 4 +- .../walkers/phasing/RefSeqDataParser.java | 54 +++++++ .../phasing/MergeMNPsIntegrationTest.java | 51 ++++++ ...gatingAlternateAllelesIntegrationTest.java | 12 +- 8 files changed, 461 insertions(+), 78 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java rename java/src/org/broadinstitute/sting/gatk/walkers/phasing/{MergePhasedSegregatingAlternateAllelesVCFWriter.java => MergeSegregatingAlternateAllelesVCFWriter.java} (79%) create mode 100644 java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index d755a7ecc..1f0d732e5 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -701,13 +701,27 @@ public class VariantContextUtils { return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd()); } - // NOTE: returns null if vc1 and vc2 are not mergeable into a single MNP record - public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser,VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { + public abstract static class AlleleMergeRule { + // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): + abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); + + public String toString() { + return "all samples are mergeable"; + } + } + + // NOTE: returns null if vc1 and vc2 are not merged into a single MNP record + + public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) return null; - // Check that it's logically possible to merge the VCs, and that there's a point in doing so (e.g., annotations could be changed): - if (!allSamplesAreMergeable(vc1, vc2) || !someSampleHasDoubleNonReferenceAllele(vc1, vc2)) + // Check that it's logically possible to merge the VCs: + if (!allSamplesAreMergeable(vc1, vc2)) + return null; + + // Check if there's a "point" in merging the VCs (e.g., annotations could be changed) + if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2)) return null; return reallyMergeIntoMNP(vc1, vc2, referenceFile); @@ -788,7 +802,7 @@ public class VariantContextUtils { if (!(other instanceof AlleleOneAndTwo)) return false; - AlleleOneAndTwo otherAot = (AlleleOneAndTwo)other; + AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other; return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2)); } } @@ -877,9 +891,9 @@ public class VariantContextUtils { return mergedAttribs; } - private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser,VariantContext vc1, VariantContext vc2) { - GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser,vc1); - GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser,vc2); + private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { + GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1); + GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2); if (!loc1.onSameContig(loc2)) throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome"); @@ -910,6 +924,7 @@ public class VariantContextUtils { } // Assumes that vc1 and vc2 were already checked to have the same sample names: + private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { @@ -948,6 +963,7 @@ public class VariantContextUtils { } // Assumes that alleleSegregationIsKnown(gt1, gt2): + private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) { if (gt2.genotypesArePhased() || gt2.isHom()) return new PhaseAndQuality(gt1); // maintain the phase of gt1 @@ -972,7 +988,8 @@ public class VariantContextUtils { /* Checks if any sample has a MNP of ALT alleles (segregating together): [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] */ - private static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { + + public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { String sample = gt1Entry.getKey(); Genotype gt1 = gt1Entry.getValue(); @@ -992,4 +1009,47 @@ public class VariantContextUtils { return false; } + + /* Checks if all samples are consistent in their haplotypes: + [Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)] + */ + + public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { + // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): + Map allele1ToAllele2 = new HashMap(); + Map allele2ToAllele1 = new HashMap(); + + // Note the segregation of the alleles for the reference genome: + allele1ToAllele2.put(vc1.getReference(), vc2.getReference()); + allele2ToAllele1.put(vc2.getReference(), vc1.getReference()); + + // Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples). + for (Map.Entry gt1Entry : vc1.getGenotypes().entrySet()) { + String sample = gt1Entry.getKey(); + Genotype gt1 = gt1Entry.getValue(); + Genotype gt2 = vc2.getGenotype(sample); + + List site1Alleles = gt1.getAlleles(); + List site2Alleles = gt2.getAlleles(); + + Iterator all2It = site2Alleles.iterator(); + for (Allele all1 : site1Alleles) { + Allele all2 = all2It.next(); + + Allele all1To2 = allele1ToAllele2.get(all1); + if (all1To2 == null) + allele1ToAllele2.put(all1, all2); + else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2 + return false; + + Allele all2To1 = allele2ToAllele1.get(all2); + if (all2To1 == null) + allele2ToAllele1.put(all2, all1); + else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1 + return false; + } + } + + return true; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java new file mode 100644 index 000000000..2d5fd2d9a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; + +import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods; + + +/** + * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD. + */ +@Allows(value = {DataSource.REFERENCE}) +@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@By(DataSource.REFERENCE_ORDERED_DATA) + +public class MergeMNPsWalker extends RodWalker { + + @Output(doc = "File to which variants should be written", required = true) + protected VCFWriter writer = null; + private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null; + + @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false) + protected int maxGenomicDistanceForMNP = 1; + + private LinkedList rodNames = null; + + public void initialize() { + rodNames = new LinkedList(); + rodNames.add("variant"); + + initializeVcfWriter(); + } + + private void initializeVcfWriter() { + // false <-> don't take control of writer, since didn't create it: + vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false); + writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] + + // setup the header fields: + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); + vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); + } + + public boolean generateExtendedEvents() { + return false; + } + + public Integer reduceInit() { + return 0; + } + + /** + * For each site, send it to be (possibly) merged with previously observed sites. + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return dummy Integer + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + boolean requireStartHere = true; // only see each VariantContext once + boolean takeFirstOnly = false; // take as many entries as the VCF file has + for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) + writeVCF(vc); + + return 0; + } + + private void writeVCF(VariantContext vc) { + byte refBase; + if (!vc.isIndel()) { + Allele varAllele = vc.getReference(); + refBase = SNPallelePair.getSingleBase(varAllele); + } + else { + refBase = vc.getReferenceBaseForIndel(); + } + + vcMergerWriter.add(vc, refBase); + } + + public Integer reduce(Integer result, Integer total) { + if (result == null) + return total; + + return total + result; + } + + /** + * Release any VariantContexts not yet processed. + * + * @param result Empty for now... + */ + public void onTraversalDone(Integer result) { + vcMergerWriter.close(); + + System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge()); + System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule()); + System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords()); + System.out.println(vcMergerWriter.getAltAlleleStats()); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java similarity index 79% rename from java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java rename to java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index ea3a95bda..05e96e7da 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergePhasedSegregatingAlternateAllelesVCFWriter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -42,13 +42,15 @@ import java.util.*; // Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts -public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWriter { +public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private VCFWriter innerWriter; private GenomeLocParser genomeLocParser; private ReferenceSequenceFile referenceFileForMNPmerging; - private MergeRule mergeRule; + + private VariantContextMergeRule vcMergeRule; + private VariantContextUtils.AlleleMergeRule alleleMergeRule; private String useSingleSample = null; @@ -67,11 +69,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite // Should we call innerWriter.close() in close() private boolean takeOwnershipOfInner; - public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, MergeRule mergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { + public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { this.innerWriter = innerWriter; this.genomeLocParser = genomeLocParser; this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile); - this.mergeRule = mergeRule; + this.vcMergeRule = vcMergeRule; + this.alleleMergeRule = alleleMergeRule; this.useSingleSample = singleSample; this.emitOnlyMergedRecords = emitOnlyMergedRecords; @@ -87,8 +90,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite this.takeOwnershipOfInner = takeOwnershipOfInner; } - public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) { - this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics + public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger, boolean takeOwnershipOfInner) { + this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), new SegregatingMNPmergeAllelesRule(), null, false, logger, takeOwnershipOfInner, true); // by default: consider all samples, emit all records, keep track of alt allele statistics } public void writeHeader(VCFHeader header) { @@ -146,7 +149,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite } else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them: numRecordsAttemptToMerge++; - boolean shouldMerge = mergeRule.shouldMerge(vcfrWaitingToMerge.vc, vc); + boolean shouldAttemptToMerge = vcMergeRule.shouldAttemptToMerge(vcfrWaitingToMerge.vc, vc); /* TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? -- @@ -163,14 +166,20 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT */ if (altAlleleStats != null) - altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldMerge); + altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldAttemptToMerge); boolean mergedRecords = false; - if (shouldMerge) { + if (shouldAttemptToMerge) { numRecordsSatisfyingMergeRule++; - VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging); + VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule); + if (mergedVc != null) { mergedRecords = true; + + Map addedAttribs = vcMergeRule.addToMergedAttributes(vcfrWaitingToMerge.vc, vc); + addedAttribs.putAll(mergedVc.getAttributes()); + mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); + vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true); numMergedRecords++; } @@ -213,8 +222,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite return numMergedRecords; } - public MergeRule getMergeRule() { - return mergeRule; + public VariantContextMergeRule getVcMergeRule() { + return vcMergeRule; + } + + public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() { + return alleleMergeRule; } /** @@ -248,7 +261,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite private class AltAlleleStats { public int numSuccessiveGenotypes; - public int numSuccessiveGenotypesThatCouldBeMerged; + public int numSuccessiveGenotypesAttemptedToBeMerged; public int oneSampleMissing; public int atLeastOneSampleNotCalledOrFiltered; @@ -267,7 +280,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite public AltAlleleStats() { this.numSuccessiveGenotypes = 0; - this.numSuccessiveGenotypesThatCouldBeMerged = 0; + this.numSuccessiveGenotypesAttemptedToBeMerged = 0; this.oneSampleMissing = 0; this.atLeastOneSampleNotCalledOrFiltered = 0; @@ -292,11 +305,11 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n"); sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n"); - sb.append("Number of successive pairs of genotypes with " + mergeRule + ":\t" + numSuccessiveGenotypesThatCouldBeMerged + "\n"); + sb.append("Number of successive pairs of genotypes with " + vcMergeRule + ":\t" + numSuccessiveGenotypesAttemptedToBeMerged + "\n"); - sb.append("Unknown segregation, " + mergeRule + ":\t" + segregationUnknown + "\n"); - sb.append("Not variant at least one of pair, segregation known, " + mergeRule + ":\t" + eitherNotVariant + "\n"); - sb.append("* Variant at both, segregation known, " + mergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n"); + sb.append("Unknown segregation, " + vcMergeRule + ":\t" + segregationUnknown + "\n"); + sb.append("Not variant at least one of pair, segregation known, " + vcMergeRule + ":\t" + eitherNotVariant + "\n"); + sb.append("* Variant at both, segregation known, " + vcMergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n"); sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n"); sb.append("REF-REF:\t" + ref_ref_pair + "\n"); @@ -326,7 +339,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite this.sampleStats = new HashMap(); } - public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldMerge) { + public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldAttemptToMerge) { if (vc1.isFiltered() || vc2.isFiltered()) return; @@ -351,8 +364,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite else { aas.numSuccessiveGenotypes++; - if (shouldMerge) { - aas.numSuccessiveGenotypesThatCouldBeMerged++; + if (shouldAttemptToMerge) { + aas.numSuccessiveGenotypesAttemptedToBeMerged++; if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) { aas.segregationUnknown++; @@ -428,7 +441,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite public String toString() { StringBuilder sb = new StringBuilder(); sb.append("-------------------------------------------------------------------------\n"); - sb.append("Per-sample alternate allele statistics [" + mergeRule + "]\n"); + sb.append("Per-sample alternate allele statistics [" + vcMergeRule + "]\n"); sb.append("-------------------------------------------------------------------------"); for (Map.Entry sampAltAllStatsEntry : sampleStats.entrySet()) { @@ -440,4 +453,66 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite return sb.toString(); } } +} + + + +/* + External classes: + */ + +abstract class VariantContextMergeRule { + abstract public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2); + + public Map addToMergedAttributes(VariantContext vc1, VariantContext vc2) { + return new HashMap(); + } +} + +class DistanceMergeRule extends VariantContextMergeRule { + private int maxGenomicDistanceForMNP; + private GenomeLocParser genomeLocParser; + + public DistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser) { + this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP; + this.genomeLocParser = genomeLocParser; + } + + public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) { + return minDistance(vc1, vc2) <= maxGenomicDistanceForMNP; + } + + public String toString() { + return "Merge distance <= " + maxGenomicDistanceForMNP; + } + + public int minDistance(VariantContext vc1, VariantContext vc2) { + return VariantContextUtils.getLocation(genomeLocParser, vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser, vc2)); + } +} + + +class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule { + public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { + return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2); + } + + public String toString() { + return super.toString() + ", some sample has a MNP of ALT alleles"; + } +} + +class SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule { + public SegregatingMNPmergeAllelesRule() { + super(); + } + + public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { + // Must be interesting AND consistent: + return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2); + } + + public String toString() { + return super.toString() + ", all alleles segregate consistently"; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java index 6ea7b2e01..7bbbd34d9 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java @@ -45,7 +45,7 @@ import java.util.*; import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods; /** - * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD. + * Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD. */ @Allows(value = {DataSource.REFERENCE}) @Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) @@ -55,10 +55,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker rodNames = null; public void initialize() { @@ -89,14 +92,20 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker don't take control of writer, since didn't create it: - vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics); + vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics); writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter] // setup the header fields: @@ -168,8 +177,8 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker addToMergedAttributes(VariantContext vc1, VariantContext vc2) { + Map addedAttribs = super.addToMergedAttributes(vc1, vc2); + addedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2)); + return addedAttribs; + } +} + + + +class AlwaysMergeAllelesRule extends VariantContextUtils.AlleleMergeRule { + public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { + return true; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 236c1dfc5..956c6f57d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -138,8 +138,8 @@ public class ReadBackedPhasingWalker extends RodWalker use ALL samples, false <-> emit all records, false <-> don't track the statistics of alternate alleles being merged: - writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, getToolkit().getGenomeLocParser()), null, false, logger, writer != origWriter, false); + if (enableMergePhasedSegregatingPolymorphismsToMNP) + writer = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, writer != origWriter); /* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java index 692e01eeb..14fdd6302 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.broad.tribble.util.variantcontext.VariantContext; + import java.util.*; /* Some methods for extracting RefSeq-related data from annotated VCF INFO fields: @@ -36,6 +37,8 @@ public class RefSeqDataParser { private static String NAME_KEY = REFSEQ_PREFIX + "name"; private static String NAME2_KEY = REFSEQ_PREFIX + "name2"; + private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY}; + private static Map getRefSeqEntriesToNames(VariantContext vc, boolean getName2) { String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY; String nameKeyToUseMultiplePrefix = nameKeyToUse + "_"; @@ -78,6 +81,57 @@ public class RefSeqDataParser { return getRefSeqNames(vc, false); } + public static Map getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) { + Map refSeqNameAttribs = new HashMap(); + + Map entriesMap1 = getAllRefSeqEntriesByName(vc1); + Map entriesMap2 = getAllRefSeqEntriesByName(vc2); + + Set commonNames = entriesMap1.keySet(); + commonNames.retainAll(entriesMap2.keySet()); + boolean addSuffix = commonNames.size() > 1; + int count = 1; + + for (String name : commonNames) { + RefSeqEntry refseq1 = entriesMap1.get(name); + RefSeqEntry refseq2 = entriesMap2.get(name); + + String keySuffix = ""; + if (addSuffix) + keySuffix = "_" + count; + + boolean added = false; + for (String key : NAME_KEYS) { + Object obj1 = refseq1.info.get(key); + Object obj2 = refseq2.info.get(key); + if (obj1 != null && obj2 != null && obj1.equals(obj2)) { + added = true; + String useKey = key + keySuffix; + refSeqNameAttribs.put(useKey, obj1); + } + } + if (added) + count++; + } + if (count > 1) + refSeqNameAttribs.put(NUM_RECORDS_KEY, count - 1); // since incremented count one extra time + + return refSeqNameAttribs; + } + + private static Map getAllRefSeqEntriesByName(VariantContext vc) { + Map nameToEntries = new TreeMap(); + + List allEntries = getAllRefSeqEntries(vc); + for (RefSeqEntry entry : allEntries) { + Object name = entry.info.get(NAME_KEY); + if (name != null) + nameToEntries.put(name.toString(), entry); + } + + return nameToEntries; + } + // Returns a List of SEPARATE Map for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc. private static List getAllRefSeqEntries(VariantContext vc) { List allRefSeq = new LinkedList(); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java new file mode 100644 index 000000000..9afbde302 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java @@ -0,0 +1,51 @@ +package org.broadinstitute.sting.gatk.walkers.phasing; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class MergeMNPsIntegrationTest extends WalkerTest { + + public static String baseTestString(String reference, String VCF, int maxDistMNP) { + return "-T MergeMNPs" + + " -R " + reference + + " -B:variant,VCF " + validationDataLocation + VCF + + " --maxGenomicDistanceForMNP " + maxDistMNP + + " -o %s" + + " -NO_HEADER"; + } + + + @Test + public void test1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1) + + " -L chr20:556259-756570", + 1, + Arrays.asList("19d0b2361367024bb9a83b9c15ef2453")); + executeTest("Merge MNP sites within genomic distance of 1 [TEST ONE]", spec); + } + + @Test + public void test2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10) + + " -L chr20:556259-756570", + 1, + Arrays.asList("f25a6403579dab1395773b3ba365c327")); + executeTest("Merge MNP sites within genomic distance of 10 [TEST TWO]", spec); + } + + @Test + public void test3() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100) + + " -L chr20:556259-756570", + 1, + Arrays.asList("a064955ffeea7fc4e09512f3e9cdbb9e")); + executeTest("Merge MNP sites within genomic distance of 100 [TEST THREE]", spec); + } + + +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java index 2ab6573d8..3ba46b25b 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java @@ -7,11 +7,11 @@ import java.util.Arrays; public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest { - public static String baseTestString(String reference, String VCF, int maxDistMNP) { + public static String baseTestString(String reference, String VCF, int maxDist) { return "-T MergeSegregatingAlternateAlleles" + " -R " + reference + " -B:variant,VCF " + validationDataLocation + VCF + - " --maxGenomicDistanceForMNP " + maxDistMNP + + " --maxGenomicDistance " + maxDist + " -o %s" + " -NO_HEADER"; } @@ -24,7 +24,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest + " -L chr20:556259-756570", 1, Arrays.asList("e6a14fc97dbd0aaa8e6a4d9a7f1616a6")); - executeTest("Merge MNP het sites within genomic distance of 1 [TEST ONE]", spec); + executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec); } @Test @@ -34,7 +34,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest + " -L chr20:556259-756570", 1, Arrays.asList("cc2b45c85a51b4998e30758c48f61940")); - executeTest("Merge MNP het sites within genomic distance of 10 [TEST TWO]", spec); + executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec); } @Test @@ -44,8 +44,8 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest + " -L chr20:556259-756570", 1, Arrays.asList("47300cc7a5a7d84b3c279f04c4567739")); - executeTest("Merge MNP het sites within genomic distance of 100 [TEST THREE]", spec); + executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec); } -} +} \ No newline at end of file