From 61fe4092117ba447dfc7638043a41d4f20e44403 Mon Sep 17 00:00:00 2001 From: fromer Date: Mon, 24 Jan 2011 17:53:14 +0000 Subject: [PATCH] Basic walker to count the number of (phased) hets in each exome target git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5064 348d0f76-0448-11de-a6fe-93d51630548a --- .../CountHetPhasingInIntervalWalker.java | 255 ++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/phasing/CountHetPhasingInIntervalWalker.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/phasing/CountHetPhasingInIntervalWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/phasing/CountHetPhasingInIntervalWalker.java new file mode 100755 index 000000000..7b118a381 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/phasing/CountHetPhasingInIntervalWalker.java @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.phasing; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.PrintStream; +import java.util.*; + +/** + * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. + */ +@Allows(value = {DataSource.REFERENCE}) +@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) + +@ReadFilters({ZeroMappingQualityReadFilter.class}) +// Filter out all reads with zero mapping quality + +public class CountHetPhasingInIntervalWalker extends RodWalker { + private LinkedList rodNames = null; + + private GenomeLoc prevInterval = null; + + private MultiSampleIntervalStats intervalStats = null; + + @Output + protected PrintStream out; + + public void initialize() { + rodNames = new LinkedList(); + rodNames.add("variant"); + + intervalStats = new MultiSampleIntervalStats(); + } + + public boolean generateExtendedEvents() { + return false; + } + + public Integer reduceInit() { + return 0; + } + + /** + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + int processed = 1; + + List interval = tracker.getGATKFeatureMetaData("intervals", true); + if (interval.size() != 1) { + String error = "At " + ref.getLocus() + " : Must provide a track named 'intervals' with exactly ONE interval per locus in -L argument!"; + if (interval.size() < 1) + throw new UserException(error); + else // interval.size() > 1 + logger.warn(error); + } + // Take the FIRST interval covering this locus, and WARN about multiple intervals (above): + GenomeLoc curInterval = interval.get(0).getLocation(); + logger.debug("refLocus: " + ref.getLocus() + "\tcurInterval = " + curInterval); + + boolean isNewInterval = (prevInterval == null || !curInterval.equals(prevInterval)); + if (isNewInterval) + intervalStats.startNewInterval(); + + boolean requireStartHere = true; // only see each VariantContext once + boolean takeFirstOnly = false; // take as many entries as the VCF file has + for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { + Map sampToGenotypes = vc.getGenotypes(); + for (Map.Entry sampEntry : sampToGenotypes.entrySet()) { + Genotype gt = sampEntry.getValue(); + intervalStats.processHetSiteInInterval(sampEntry.getKey(), gt.isHet(), gt.isPhased()); + } + } + + prevInterval = curInterval; + + return processed; + } + + public Integer reduce(Integer addIn, Integer runningCount) { + if (addIn == null) + addIn = 0; + + return runningCount + addIn; + } + + /** + * @param result the number of reads and VariantContexts seen. + */ + public void onTraversalDone(Integer result) { + intervalStats.finalizeStats(); + + System.out.println("Processed " + result + " sites."); + + for (Map.Entry sampleEntry : intervalStats.entrySet()) { + out.println("Sample:\t" + sampleEntry.getKey()); + out.println(sampleEntry.getValue() + "\n"); + } + } + + class MultiSampleIntervalStats { + private Map sampleToStat; + protected int numIntervals; + + public MultiSampleIntervalStats() { + this.sampleToStat = new HashMap(); + this.numIntervals = 0; + } + + public void processHetSiteInInterval(String sample, boolean isHet, boolean isPhased) { + SingleSampleIntervalStats sampleStats = sampleToStat.get(sample); + if (sampleStats == null) { + sampleStats = new SingleSampleIntervalStats(); + sampleToStat.put(sample, sampleStats); + } + + sampleStats.updateHetStats(isHet, isPhased); + } + + public void finalizeStats() { + for (SingleSampleIntervalStats stats : sampleToStat.values()) + stats.finalizeStats(); + } + + public Set> entrySet() { + return sampleToStat.entrySet(); + } + + public void startNewInterval() { + finalizeStats(); + numIntervals++; + } + + private class SingleSampleIntervalStats { + public Map hetStatInIntervalToCount; + public int firstHetIsPhased; + + private int numHetsInCurrentInterval; + private int numPhasedInCurrentInterval; + + public SingleSampleIntervalStats() { + this.hetStatInIntervalToCount = new TreeMap(); // implemented PhasedHetsStat.compareTo() + this.firstHetIsPhased = 0; + + this.numHetsInCurrentInterval = 0; + this.numPhasedInCurrentInterval = 0; + } + + public void updateHetStats(boolean isHet, boolean isPhased) { + if (isHet) { + numHetsInCurrentInterval++; + + if (isPhased) { + numPhasedInCurrentInterval++; + + if (numHetsInCurrentInterval == 1) + firstHetIsPhased++; + } + } + } + + public void finalizeStats() { + if (numIntervals == 0) // have not yet seen any intervals + return; + + PhasedHetsStat hetsAndPhased = new PhasedHetsStat(numHetsInCurrentInterval, numPhasedInCurrentInterval); + Integer cnt = hetStatInIntervalToCount.get(hetsAndPhased); + if (cnt == null) + cnt = 0; + hetStatInIntervalToCount.put(hetsAndPhased, cnt + 1); + + numHetsInCurrentInterval = 0; + numPhasedInCurrentInterval = 0; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append("# of intervals: " + numIntervals + "\n"); + sb.append("First het is phased: " + firstHetIsPhased + "\n"); + + sb.append("Distribution of number of phased / hets per interval:" + "\n"); + for (Map.Entry hetStatEntry : hetStatInIntervalToCount.entrySet()) + sb.append(hetStatEntry.getKey() + "\t" + hetStatEntry.getValue() + "\n"); + + return sb.toString(); + } + } + } + + class PhasedHetsStat implements Comparable { + public int numHets; + public int numPhased; + + public PhasedHetsStat(int numHets, int numPhased) { + this.numHets = numHets; + this.numPhased = numPhased; + } + + public int compareTo(PhasedHetsStat that) { + if (this.numHets != that.numHets) + return this.numHets - that.numHets; + + return this.numPhased - that.numPhased; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + + sb.append(numPhased + " / " + numHets); + + return sb.toString(); + } + } +} \ No newline at end of file