From 568a1362f54a505cf14f5f02078f135c894eac4f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Mar 2012 16:19:15 -0400 Subject: [PATCH] Splitting up the MultiallelicSummary module into the standard part for use by all and the dev piece used just by me --- .../evaluators/MultiallelicAFs.java | 254 ++++++++++++++++++ .../evaluators/MultiallelicSummary.java | 2 +- 2 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicAFs.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicAFs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicAFs.java new file mode 100644 index 000000000..056b54945 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicAFs.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +@Analysis(description = "Evaluation summary for multi-allelic variants") +public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval { + final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); + + public enum Type { + SNP, INDEL + } + + // basic counts on various rates found + @DataPoint(description = "Number of processed loci", format = "%d") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs", format = "%d") + public int nSNPs = 0; + @DataPoint(description = "Number of multi-allelic SNPs", format = "%d") + public int nMultiSNPs = 0; + @DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f") + public double processedMultiSnpRatio = 0; + @DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f") + public double variantMultiSnpRatio = 0; + + @DataPoint(description = "Number of Indels", format = "%d") + public int nIndels = 0; + @DataPoint(description = "Number of multi-allelic Indels", format = "%d") + public int nMultiIndels = 0; + @DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f") + public double processedMultiIndelRatio = 0; + @DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f") + public double variantMultiIndelRatio = 0; + + @DataPoint(description = "Number of Transitions", format = "%d") + public int nTi = 0; + @DataPoint(description = "Number of Transversions", format = "%d") + public int nTv = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + + @DataPoint(description = "Multi-allelic SNPs partially known", format = "%d") + public int knownSNPsPartial = 0; + @DataPoint(description = "Multi-allelic SNPs completely known", format = "%d") + public int knownSNPsComplete = 0; + @DataPoint(description = "Multi-allelic SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + + //TODO -- implement me + //@DataPoint(description = "Multi-allelic Indels partially known", format = "%d") + public int knownIndelsPartial = 0; + //@DataPoint(description = "Multi-allelic Indels completely known", format = "%d") + public int knownIndelsComplete = 0; + //@DataPoint(description = "Multi-allelic Indel Novelty Rate") + public String indelNoveltyRate = "NA"; + + @DataPoint(description="Histogram of allele frequencies for most common SNP alternate allele") + AFHistogram AFhistogramMaxSnp = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for less common SNP alternate alleles") + AFHistogram AFhistogramMinSnp = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for most common Indel alternate allele") + AFHistogram AFhistogramMaxIndel = new AFHistogram(); + + @DataPoint(description="Histogram of allele frequencies for less common Indel alternate alleles") + AFHistogram AFhistogramMinIndel = new AFHistogram(); + + /* + * AF histogram table object + */ + static class AFHistogram implements TableType { + private Object[] rowKeys, colKeys = {"count"}; + private int[] AFhistogram; + + private static final double AFincrement = 0.01; + private static final int numBins = (int)(1.00 / AFincrement); + + public AFHistogram() { + rowKeys = initRowKeys(); + AFhistogram = new int[rowKeys.length]; + } + + public Object[] getColumnKeys() { + return colKeys; + } + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object getCell(int row, int col) { + return AFhistogram[row]; + } + + private static Object[] initRowKeys() { + ArrayList keyList = new ArrayList(numBins + 1); + for ( double a = 0.00; a <= 1.01; a += AFincrement ) { + keyList.add(String.format("%.2f", a)); + } + return keyList.toArray(); + } + + public String getName() { return "AFHistTable"; } + + public void update(final double AF) { + final int bin = (int)(numBins * MathUtils.round(AF, 2)); + AFhistogram[bin]++; + } + } + + public void initialize(VariantEvalWalker walker) {} + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) + return null; + + // update counts + switch ( eval.getType() ) { + case SNP: + nSNPs++; + if ( !eval.isBiallelic() ) { + nMultiSNPs++; + calculatePairwiseTiTv(eval); + calculateSNPPairwiseNovelty(eval, comp); + updateAFhistogram(eval, AFhistogramMaxSnp, AFhistogramMinSnp); + } + break; + case INDEL: + nIndels++; + if ( !eval.isBiallelic() ) { + nMultiIndels++; + calculateIndelPairwiseNovelty(eval, comp); + updateAFhistogram(eval, AFhistogramMaxIndel, AFhistogramMinIndel); + } + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + return null; // we don't capture any interesting sites + } + + private void calculatePairwiseTiTv(VariantContext vc) { + for ( Allele alt : vc.getAlternateAlleles() ) { + if ( VariantContextUtils.isTransition(vc.getReference(), alt) ) + nTi++; + else + nTv++; + } + } + + private void calculateSNPPairwiseNovelty(VariantContext eval, VariantContext comp) { + if ( comp == null ) + return; + + int knownAlleles = 0; + for ( Allele alt : eval.getAlternateAlleles() ) { + if ( comp.getAlternateAlleles().contains(alt) ) + knownAlleles++; + } + + if ( knownAlleles == eval.getAlternateAlleles().size() ) + knownSNPsComplete++; + else if ( knownAlleles > 0 ) + knownSNPsPartial++; + } + + private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) { + } + + private void updateAFhistogram(VariantContext vc, AFHistogram max, AFHistogram min) { + + final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null); + if ( obj == null || !(obj instanceof List) ) + return; + + List list = (List)obj; + ArrayList AFs = new ArrayList(list.size()); + for ( String str : list ) { + AFs.add(Double.valueOf(str)); + } + + Collections.sort(AFs); + max.update(AFs.get(AFs.size()-1)); + for ( int i = 0; i < AFs.size() - 1; i++ ) + min.update(AFs.get(i)); + } + + private final String noveltyRate(final int all, final int known) { + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ? "NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; + variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; + processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; + variantMultiIndelRatio = (double)nMultiIndels / (double)nIndels; + + TiTvRatio = (double)nTi / (double)nTv; + + SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); + indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java index 056b54945..1a4aa1cc8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @Analysis(description = "Evaluation summary for multi-allelic variants") -public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval { +public class MultiallelicSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); public enum Type {