From 74298f685815199e4a325395e4cc2c397c75f3ed Mon Sep 17 00:00:00 2001 From: fromer Date: Thu, 2 Jun 2011 21:46:35 +0000 Subject: [PATCH] Basic walker to calculate statistics of CNV genotyping copy counts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5927 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/CNV/CNVstatsWalker.java | 288 ++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java new file mode 100644 index 000000000..c1ab782cf --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/CNVstatsWalker.java @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers.CNV; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; + +import java.io.PrintStream; +import java.util.*; + + +/** + * Walks along all variant ROD loci, and tabulates the statistics of the CNVs detected. + */ +@Allows(value = {DataSource.REFERENCE}) +@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@By(DataSource.REFERENCE_ORDERED_DATA) + +public class CNVstatsWalker extends RodWalker { + + @Output(doc = "File to which copy number counts should be written", required = true) + protected PrintStream out; + + @Argument(fullName = "alleleCountsCopyCounts", shortName = "AC_CC", doc = "File to which discovered allele copy and copy number counts should be written", required = false) + private PrintStream alleleCountsCopyCounts = null; + + @Argument(fullName = "allowFilteredGenotypes", shortName = "allowFilteredGenotypes", doc = "Include filtered genotypes in the output/calculations", required = false) + private boolean ALLOW_FILTERED_GENOTYPES = false; + + private LinkedList rodNames = null; + + public static String CNV_TAG = ""; + public static String CN_FIELD = "CN"; + + public static String SVLEN_FIELD = "SVLEN"; + public static String AC_FIELD = "AC"; + + public static int DIPLOID = 2; + + public void initialize() { + rodNames = new LinkedList(); + rodNames.add("variant"); + } + + public boolean generateExtendedEvents() { + return false; + } + + public CNVstatistics reduceInit() { + return new CNVstatistics(); + } + + /** + * For each site, calculate the CNV stats. + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return dummy Integer + */ + public CNVstatistics map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker == null) + return null; + + logger.debug("REF:" + ref.getLocus()); + CNVstatistics stats = new CNVstatistics(); + + boolean requireStartHere = true; // only see each VariantContext once + boolean takeFirstOnly = false; // take as many entries as the VCF file has + for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { + if (vc.isMixed() && vc.isBiallelic()) { + Allele altAll = vc.getAlternateAllele(0); + if (altAll.isSymbolic() && altAll.getDisplayString().equals(CNV_TAG)) { + logger.debug("Found CNV at locus..."); + stats.cnvDeclaredLoci++; + + CopyNumberCounts cnc = new CopyNumberCounts(); + + boolean hasDiploidGt = false; + boolean hasNonDiploidGt = false; + for (Map.Entry gtEntry : vc.getGenotypes().entrySet()) { + Genotype gt = gtEntry.getValue(); + Integer copyNum = gt.getAttributeAsIntegerNoException(CN_FIELD); + if (copyNum != null && (ALLOW_FILTERED_GENOTYPES || gt.isNotFiltered())) { + cnc.incrementCopyNumber(copyNum); + + if (copyNum == DIPLOID) + hasDiploidGt = true; + else + hasNonDiploidGt = true; + } + } + + if (hasDiploidGt && hasNonDiploidGt) { + stats.diploidAndNonDiploidLoci++; + } + else { + if (hasDiploidGt) + stats.diploidOnlyLoci++; + if (hasNonDiploidGt) + stats.nonDiploidOnlyLoci++; + } + + int cnvEnd = vc.getEnd(); + Integer cnvLength = vc.getAttributeAsIntegerNoException(SVLEN_FIELD); + if (cnvLength != null) + cnvEnd = vc.getStart() + cnvLength - 1; + GenomeLoc vcLoc = getToolkit().getGenomeLocParser().createGenomeLoc(vc.getChr(), vc.getStart(), cnvEnd, true); + out.print(vcLoc); + + for (Map.Entry copyNumEntry : cnc.entrySet()) { + out.print("\t" + copyNumEntry.getKey() + ":" + copyNumEntry.getValue()); + } + out.println(); + + if (alleleCountsCopyCounts != null) { + Integer ac = vc.getAttributeAsIntegerNoException(AC_FIELD); + CopyNumberCounts.DeletionDuplicationCounts counts = cnc.deletionDuplicationCounts(); + double cnvCount = counts.deletionCount + counts.duplicationCount; + + alleleCountsCopyCounts.println(vcLoc + "\t" + ac + "\t" + counts.deletionCount + "\t" + counts.duplicationCount + "\t" + cnvCount); + } + } + } + } + + return stats; + } + + public CNVstatistics reduce(CNVstatistics result, CNVstatistics total) { + if (result == null) + return total; + + return total.addIn(result); + } + + /** + * @param result statistics of CNV sites + */ + public void onTraversalDone(CNVstatistics result) { + System.out.println(); + System.out.println("--------------------------------------"); + System.out.println("CNV summary:"); + System.out.println("--------------------------------------"); + + System.out.println("cnvDeclaredLoci: " + result.cnvDeclaredLoci); + + System.out.println(); + System.out.println("noGenotypesLoci: " + result.noGenotypesLoci()); + + System.out.println(); + System.out.println("nonDiploidOnlyLoci: " + result.nonDiploidOnlyLoci); + + System.out.println(); + System.out.println("lociWithDiploid: " + result.lociWithDiploid()); + System.out.println("diploidOnlyLoci: " + result.diploidOnlyLoci); + System.out.println("diploidAndNonDiploidLoci: " + result.diploidAndNonDiploidLoci); + String onlyDiploidRateStr = percentageString(result.diploidOnlyLoci, result.lociWithDiploid()); + System.out.println("onlyDiploidRate = " + onlyDiploidRateStr + "%"); + + System.out.println(); + int noDiploidGenotypes = result.noGenotypesLoci() + result.nonDiploidOnlyLoci; + System.out.println("loci with no diploid genotypes: " + noDiploidGenotypes); + String noDiploidGtRateStr = percentageString(noDiploidGenotypes, result.cnvDeclaredLoci); + System.out.println("noDiploidGtRate = " + noDiploidGtRateStr + "%"); + } + + private static String percentageString(int numerator, int denominator) { + int NUM_DECIMAL_PLACES = 2; + + return new Formatter().format("%." + NUM_DECIMAL_PLACES + "f", MathUtils.percentage(numerator, denominator)).toString(); + } +} + + +class CNVstatistics { + protected int cnvDeclaredLoci, diploidOnlyLoci, nonDiploidOnlyLoci, diploidAndNonDiploidLoci; + + public CNVstatistics() { + this.cnvDeclaredLoci = 0; + this.diploidOnlyLoci = 0; + this.nonDiploidOnlyLoci = 0; + this.diploidAndNonDiploidLoci = 0; + } + + public CNVstatistics addIn(CNVstatistics other) { + this.cnvDeclaredLoci += other.cnvDeclaredLoci; + this.diploidOnlyLoci += other.diploidOnlyLoci; + this.nonDiploidOnlyLoci += other.nonDiploidOnlyLoci; + this.diploidAndNonDiploidLoci += other.diploidAndNonDiploidLoci; + + return this; + } + + public int noGenotypesLoci() { + return cnvDeclaredLoci - (diploidOnlyLoci + nonDiploidOnlyLoci + diploidAndNonDiploidLoci); + } + + public int lociWithDiploid() { + return diploidOnlyLoci + diploidAndNonDiploidLoci; + } +} + +class CopyNumberCounts { + private Map copyNumToCountsMap; + + public CopyNumberCounts() { + this.copyNumToCountsMap = new TreeMap(); + } + + public void incrementCopyNumber(int copyNum) { + Integer count = copyNumToCountsMap.get(copyNum); + if (count == null) + count = 0; + + copyNumToCountsMap.put(copyNum, count + 1); + } + + public Set> entrySet() { + return copyNumToCountsMap.entrySet(); + } + + class DeletionDuplicationCounts { + public double deletionCount; + public double duplicationCount; + + public DeletionDuplicationCounts() { + this.deletionCount = 0; + this.duplicationCount = 0; + } + } + + public DeletionDuplicationCounts deletionDuplicationCounts() { + int total = 0; + DeletionDuplicationCounts counts = new DeletionDuplicationCounts(); + + for (Map.Entry copyNumEntry : this.entrySet()) { + int copyNum = copyNumEntry.getKey(); + int count = copyNumEntry.getValue(); + + if (copyNum < CNVstatsWalker.DIPLOID) { + counts.deletionCount += count; + } + else if (copyNum > CNVstatsWalker.DIPLOID) { + counts.duplicationCount += count; + } + + total += count; + } + + //counts.deletionCount /= total; + //counts.duplicationCount /= total; + + return counts; + } +} \ No newline at end of file