From 944a7d815e640d6bf2e4661cbf0bb6cea70ea1f3 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Sat, 28 Apr 2012 11:31:03 -0400 Subject: [PATCH] Bringing VQSRV3 up to date. Lots of new features (un-classifying the worst-performing training sites, treating the x% best/worst sites as postive/negative points, ability to pass in a monomorphic track to see ROC curves output). Minor changes to AlleleBalance: weighted average was incorrectly specified (using logscale actually biased the average towards the AB of low-quality genotypes), and breaking out AB by het, hom, and diploid to bring it in line with some (private) changes to the indel likelihood model that (correctly) computes these values for indels. --- .../gatk/walkers/annotator/AlleleBalance.java | 87 ++++++++++++++----- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index ea356e050..04c7ab756 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -50,22 +51,25 @@ import java.util.Map; */ public class AlleleBalance extends InfoFieldAnnotation { + + char[] BASES = {'A','C','G','T'}; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; - + if ( !vc.isBiallelic() ) return null; final GenotypesContext genotypes = vc.getGenotypes(); if ( !vc.hasGenotypes() ) return null; - double ratio = 0.0; - double totalWeights = 0.0; + double ratioHom = 0.0; + double ratioHet = 0.0; + double weightHom = 0.0; + double weightHet = 0.0; + double overallNonDiploid = 0.0; for ( Genotype genotype : genotypes ) { // we care only about het calls - if ( !genotype.isHet() ) - continue; AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null || !context.hasBasePileup() ) @@ -76,35 +80,72 @@ public class AlleleBalance extends InfoFieldAnnotation { final String bases = new String(pileup.getBases()); if ( bases.length() == 0 ) return null; - final char refChr = vc.getReference().toString().charAt(0); - final char altChr = vc.getAlternateAllele(0).toString().charAt(0); - final int refCount = MathUtils.countOccurrences(refChr, bases); - final int altCount = MathUtils.countOccurrences(altChr, bases); + double pTrue = 1.0 - Math.pow(10.0,genotype.getLog10PError()); + if ( genotype.isHet() ) { + final char refChr = vc.getReference().toString().charAt(0); + final char altChr = vc.getAlternateAllele(0).toString().charAt(0); - // sanity check - if ( refCount + altCount == 0 ) - continue; + final int refCount = MathUtils.countOccurrences(refChr, bases); + final int altCount = MathUtils.countOccurrences(altChr, bases); + final int otherCount = bases.length()-refCount-altCount; - // weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much - ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount)); - totalWeights += genotype.getLog10PError(); + // sanity check + if ( refCount + altCount == 0 ) + continue; + + // weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much + ratioHet += pTrue * ((double)refCount / (double)(refCount + altCount)); + weightHet += pTrue; + overallNonDiploid += ( (double) otherCount )/(bases.length()*genotypes.size()); + } else if ( genotype.isHom() ) { + char alleleChr; + if ( genotype.isHomRef() ) { + alleleChr = vc.getReference().toString().charAt(0); + } else { + alleleChr = vc.getAlternateAllele(0).toString().charAt(0); + } + final int alleleCount = MathUtils.countOccurrences(alleleChr,bases); + int bestOtherCount = 0; + for ( char b : BASES ) { + if ( b == alleleChr ) + continue; + int count = MathUtils.countOccurrences(b,bases); + if ( count > bestOtherCount ) + bestOtherCount = count; + } + final int otherCount = bases.length() - alleleCount; + ratioHom += pTrue*( (double) alleleCount)/(alleleCount+bestOtherCount); + weightHom += pTrue; + overallNonDiploid += ((double ) otherCount)/(bases.length()*genotypes.size()); + } + // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of + // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from + // scratch, be my guest - but make sure it's done correctly! [EB] } - // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of - // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from - // scratch, be my guest - but make sure it's done correctly! [EB] } // make sure we had a het genotype - if ( MathUtils.compareDoubles(totalWeights, 0.0) == 0 ) - return null; Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.3f", (ratio / totalWeights))); + if ( weightHet > 0.0 ) { + map.put("ABHet",ratioHet/weightHet); + } + + if ( weightHom > 0.0 ) { + map.put("ABHom",ratioHom/weightHom); + } + + if ( overallNonDiploid > 0.0 ) { + map.put("OND",overallNonDiploid); + } return map; } - public List getKeyNames() { return Arrays.asList("AB"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("AB", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))")); } + public List getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))"), + new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homs (A/(A+O))"), + new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (alleles/(alleles+non-alleles))")); } }