Merge pull request #661 from broadinstitute/jw_allele_balance_gvcf

Enable AB annotation in reference model pipeline. Incorporates patches f...
This commit is contained in:
Ryan Poplin 2014-06-19 13:10:41 -04:00
commit da1dab6c32
2 changed files with 179 additions and 83 deletions

View File

@ -25,19 +25,21 @@
package org.broadinstitute.gatk.tools.walkers.annotator; package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.vcf.VCFHeaderLineType; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
@ -55,16 +57,14 @@ import java.util.Map;
*/ */
public class AlleleBalance extends InfoFieldAnnotation { public class AlleleBalance extends InfoFieldAnnotation {
char[] BASES = {'A','C','G','T'};
public Map<String, Object> annotate(final RefMetaDataTracker tracker, public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker, final AnnotatorCompatible walker,
final ReferenceContext ref, final ReferenceContext ref,
final Map<String, AlignmentContext> stratifiedContexts, final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc, final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) { final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( stratifiedContexts.size() == 0 ) //if ( stratifiedContexts.size() == 0 )
return null; // return null;
if ( !vc.isBiallelic() ) if ( !vc.isBiallelic() )
return null; return null;
@ -78,55 +78,45 @@ public class AlleleBalance extends InfoFieldAnnotation {
double weightHet = 0.0; double weightHet = 0.0;
double overallNonDiploid = 0.0; double overallNonDiploid = 0.0;
for ( Genotype genotype : genotypes ) { for ( Genotype genotype : genotypes ) {
// we care only about het calls
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( vc.isSNP() ) {
if ( context == null )
final int[] counts = getCounts(genotype, stratifiedContexts, vc);
// If AD was not calculated, we can't continue
if(counts == null)
continue; continue;
final ReadBackedPileup pileup = context.getBasePileup(); final int n_allele = counts.length;
if ( vc.isSNP() ) { int count_sum = 0;
final String bases = new String(pileup.getBases()); for(int i=0; i<n_allele; i++){
if ( bases.length() == 0 ) count_sum += counts[i];
return null; }
double pTrue = 1.0 - Math.pow(10.0,-genotype.getGQ() / (double) 10 );
double pTrue = 1.0 - Math.pow(10.0,genotype.getLog10PError());
if ( genotype.isHet() ) { if ( genotype.isHet() ) {
final char refChr = vc.getReference().toString().charAt(0);
final char altChr = vc.getAlternateAllele(0).toString().charAt(0);
final int refCount = MathUtils.countOccurrences(refChr, bases);
final int altCount = MathUtils.countOccurrences(altChr, bases);
final int otherCount = bases.length()-refCount-altCount;
final int otherCount = count_sum - (counts[0] + counts[1]);
// sanity check // sanity check
if ( refCount + altCount == 0 ) if ( counts[0] + counts[1] == 0 )
continue; continue;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much // weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
ratioHet += pTrue * ((double)refCount / (double)(refCount + altCount)); ratioHet += pTrue * ((double)counts[0] / (double)(counts[0] + counts[1]));
weightHet += pTrue; weightHet += pTrue;
overallNonDiploid += ( (double) otherCount )/(bases.length()*genotypes.size()); overallNonDiploid += ( (double) otherCount )/((double) count_sum*genotypes.size());
} else if ( genotype.isHom() ) { } else if ( genotype.isHom() ) {
char alleleChr; final int alleleIdx = genotype.isHomRef() ? 0 : 1 ;
if ( genotype.isHomRef() ) { final int alleleCount = counts[alleleIdx];
alleleChr = vc.getReference().toString().charAt(0);
} else {
alleleChr = vc.getAlternateAllele(0).toString().charAt(0);
}
final int alleleCount = MathUtils.countOccurrences(alleleChr,bases);
int bestOtherCount = 0; int bestOtherCount = 0;
for ( char b : BASES ) { for(int i=0; i<n_allele; i++){
if ( b == alleleChr ) if( i == alleleIdx )
continue; continue;
int count = MathUtils.countOccurrences(b,bases); if( counts[i] > bestOtherCount )
if ( count > bestOtherCount ) bestOtherCount = counts[i];
bestOtherCount = count;
} }
final int otherCount = bases.length() - alleleCount; final int otherCount = count_sum - alleleCount;
ratioHom += pTrue*( (double) alleleCount)/(alleleCount+bestOtherCount); ratioHom += pTrue*( (double) alleleCount)/((double) (alleleCount+bestOtherCount));
weightHom += pTrue; weightHom += pTrue;
overallNonDiploid += ((double ) otherCount)/(bases.length()*genotypes.size()); overallNonDiploid += ((double ) otherCount)/((double) count_sum*genotypes.size());
} }
// Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of
// prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from
@ -136,7 +126,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
// make sure we had a het genotype // make sure we had a het genotype
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<>();
if ( weightHet > 0.0 ) { if ( weightHet > 0.0 ) {
map.put("ABHet",ratioHet/weightHet); map.put("ABHet",ratioHet/weightHet);
} }
@ -151,6 +141,58 @@ public class AlleleBalance extends InfoFieldAnnotation {
return map; return map;
} }
/**
* Provide a centralized method of getting the number of reads per allele,
* depending on the input given. Will use the following (in order of preference):
* - genotype.getAD()
* - reads from an AlignmentContext
* - reads from a PerReadAlleleLikelihoodMap (Not yet implemented)
*
*
* @param genotype The genotype of interest
* @param stratifiedContexts A mapping
* @param vc
* @return
*/
private int[] getCounts(final Genotype genotype,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc){
// Can't do anything without a genotype here
if(genotype == null)
return null;
int[] retVal = genotype.getAD();
AlignmentContext context;
if ( retVal == null && stratifiedContexts != null &&
(context = stratifiedContexts.get(genotype.getSampleName())) != null){
// If we get to this point, the getAD() function returned no information
// about AlleleDepth by Sample - perhaps it wasn't annotated?
// In that case, let's try to build it up using the algorithm that
// was here in v 3.1-1 and earlier
// Also, b/c of the assignment check in the if statement above,
// we know we have a valid AlignmentContext for this sample!
final ReadBackedPileup pileup = context.getBasePileup();
final String bases = new String(pileup.getBases());
List<Allele> alleles = vc.getAlleles();
final int n_allele = alleles.size();
retVal = new int[n_allele];
// Calculate the depth for each allele, under the assumption that
// the allele is a single base
int i=0;
for(Allele a : alleles){
retVal[i] = MathUtils.countOccurrences(a.toString().charAt(0), bases);
i++;
}
}
return retVal;
}
public List<String> getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); } public List<String> getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); }

View File

@ -25,24 +25,31 @@
package org.broadinstitute.gatk.tools.walkers.annotator; package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineType;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.pileup.PileupElement;
import htsjdk.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import htsjdk.variant.vcf.VCFHeaderLineType; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set;
/** /**
@ -64,50 +71,97 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
final Genotype g, final Genotype g,
final GenotypeBuilder gb, final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ final PerReadAlleleLikelihoodMap alleleLikelihoodMap){
if ( stratifiedContext == null )
// We need a heterozygous genotype and either a context or alleleLikelihoodMap
if ( g == null || !g.isCalled() || !g.isHet() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
return;
// Test for existence of <NON_REF> allele, and manually check isSNP()
// and isBiallelic() while ignoring the <NON_REF> allele
boolean biallelicSNP = vc.isSNP() && vc.isBiallelic();
if(vc.hasAllele(GVCF_NONREF)){
// If we have the GVCF <NON_REF> allele, then the SNP is biallelic
// iff there are 3 alleles and both the reference and first alt
// allele are length 1.
biallelicSNP = vc.getAlleles().size() == 3 &&
vc.getReference().length() == 1 &&
vc.getAlternateAllele(0).length() == 1;
}
if ( !biallelicSNP )
return;
Double ratio;
if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
ratio = annotateWithLikelihoods(alleleLikelihoodMap, vc);
else if ( stratifiedContext != null )
ratio = annotateWithPileup(stratifiedContext, vc);
else
return; return;
Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null) if (ratio == null)
return; return;
gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio.doubleValue()))); gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio)));
} }
private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { private static final Allele GVCF_NONREF = Allele.create("<NON_REF>", false);
double ratio = -1;
if ( !vc.isSNP() ) private Double annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc) {
return null;
if ( !vc.isBiallelic() ) final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
return null; for ( final Allele allele : vc.getAlleles() )
alleleCounts.put(allele.getBases()[0], 0);
if ( g == null || !g.isCalled() ) final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
return null; for ( final PileupElement p : pileup ) {
if ( alleleCounts.containsKey(p.getBase()) )
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
}
if (!g.isHet()) // we need to add counts in the correct order
return null; final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
Collection<Allele> altAlleles = vc.getAlternateAlleles(); for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
if ( altAlleles.size() == 0 ) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
return null;
final String bases = new String(stratifiedContext.getBasePileup().getBases());
if ( bases.length() == 0 )
return null;
char refChr = vc.getReference().toString().charAt(0);
char altChr = vc.getAlternateAllele(0).toString().charAt(0);
int refCount = MathUtils.countOccurrences(refChr, bases);
int altCount = MathUtils.countOccurrences(altChr, bases);
// sanity check // sanity check
if ( refCount + altCount == 0 ) if(counts[0] + counts[1] == 0)
return null; return null;
ratio = ((double)refCount / (double)(refCount + altCount)); return ((double) counts[0] / (double)(counts[0] + counts[1]));
return ratio; }
private Double annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
final HashMap<Allele, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); }
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
if (! a.isInformative() ) continue; // read is non-informative
final int prevCount = alleleCounts.get(a.getMostLikelyAllele());
alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1);
}
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference());
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) );
// sanity check
if(counts[0] + counts[1] == 0)
return null;
return ((double) counts[0] / (double)(counts[0] + counts[1]));
} }
public List<String> getKeyNames() { return Arrays.asList("AB"); } public List<String> getKeyNames() { return Arrays.asList("AB"); }