Bringing VQSRV3 up to date. Lots of new features (un-classifying the worst-performing training sites, treating the x% best/worst sites as postive/negative points, ability to pass in a monomorphic track to see ROC curves output). Minor changes to AlleleBalance: weighted average was incorrectly specified (using logscale actually biased the average towards the AB of low-quality genotypes), and breaking out AB by het, hom, and diploid to bring it in line with some (private) changes to the indel likelihood model that (correctly) computes these values for indels.

This commit is contained in:
Christopher Hartl 2012-04-28 11:31:03 -04:00
parent 9801dd114f
commit 944a7d815e
1 changed files with 64 additions and 23 deletions

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -50,22 +51,25 @@ import java.util.Map;
*/
public class AlleleBalance extends InfoFieldAnnotation {
char[] BASES = {'A','C','G','T'};
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
return null;
if ( !vc.isBiallelic() )
return null;
final GenotypesContext genotypes = vc.getGenotypes();
if ( !vc.hasGenotypes() )
return null;
double ratio = 0.0;
double totalWeights = 0.0;
double ratioHom = 0.0;
double ratioHet = 0.0;
double weightHom = 0.0;
double weightHet = 0.0;
double overallNonDiploid = 0.0;
for ( Genotype genotype : genotypes ) {
// we care only about het calls
if ( !genotype.isHet() )
continue;
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null || !context.hasBasePileup() )
@ -76,35 +80,72 @@ public class AlleleBalance extends InfoFieldAnnotation {
final String bases = new String(pileup.getBases());
if ( bases.length() == 0 )
return null;
final char refChr = vc.getReference().toString().charAt(0);
final char altChr = vc.getAlternateAllele(0).toString().charAt(0);
final int refCount = MathUtils.countOccurrences(refChr, bases);
final int altCount = MathUtils.countOccurrences(altChr, bases);
double pTrue = 1.0 - Math.pow(10.0,genotype.getLog10PError());
if ( genotype.isHet() ) {
final char refChr = vc.getReference().toString().charAt(0);
final char altChr = vc.getAlternateAllele(0).toString().charAt(0);
// sanity check
if ( refCount + altCount == 0 )
continue;
final int refCount = MathUtils.countOccurrences(refChr, bases);
final int altCount = MathUtils.countOccurrences(altChr, bases);
final int otherCount = bases.length()-refCount-altCount;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount));
totalWeights += genotype.getLog10PError();
// sanity check
if ( refCount + altCount == 0 )
continue;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
ratioHet += pTrue * ((double)refCount / (double)(refCount + altCount));
weightHet += pTrue;
overallNonDiploid += ( (double) otherCount )/(bases.length()*genotypes.size());
} else if ( genotype.isHom() ) {
char alleleChr;
if ( genotype.isHomRef() ) {
alleleChr = vc.getReference().toString().charAt(0);
} else {
alleleChr = vc.getAlternateAllele(0).toString().charAt(0);
}
final int alleleCount = MathUtils.countOccurrences(alleleChr,bases);
int bestOtherCount = 0;
for ( char b : BASES ) {
if ( b == alleleChr )
continue;
int count = MathUtils.countOccurrences(b,bases);
if ( count > bestOtherCount )
bestOtherCount = count;
}
final int otherCount = bases.length() - alleleCount;
ratioHom += pTrue*( (double) alleleCount)/(alleleCount+bestOtherCount);
weightHom += pTrue;
overallNonDiploid += ((double ) otherCount)/(bases.length()*genotypes.size());
}
// Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of
// prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from
// scratch, be my guest - but make sure it's done correctly! [EB]
}
// Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of
// prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from
// scratch, be my guest - but make sure it's done correctly! [EB]
}
// make sure we had a het genotype
if ( MathUtils.compareDoubles(totalWeights, 0.0) == 0 )
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.3f", (ratio / totalWeights)));
if ( weightHet > 0.0 ) {
map.put("ABHet",ratioHet/weightHet);
}
if ( weightHom > 0.0 ) {
map.put("ABHom",ratioHom/weightHom);
}
if ( overallNonDiploid > 0.0 ) {
map.put("OND",overallNonDiploid);
}
return map;
}
public List<String> getKeyNames() { return Arrays.asList("AB"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("AB", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))")); }
public List<String> getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))"),
new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homs (A/(A+O))"),
new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (alleles/(alleles+non-alleles))")); }
}