From 6156b85ad9d26793793ab03a50160c463db8dc81 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 25 Apr 2016 15:28:22 -0400 Subject: [PATCH] Fixed logic error and tidied AlleleBalance and AlleleBalanceBySample --- .../HaplotypeCallerIntegrationTest.java | 5 + .../walkers/annotator/AlleleBalance.java | 172 ++++++++---------- .../annotator/AlleleBalanceBySample.java | 92 ++++------ 3 files changed, 108 insertions(+), 161 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index aadd03a32..f2a9a6f20 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -522,5 +522,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList(md5)); executeTest("testHaplotypeCallerRemoveAltAlleleBasedOnHaptypeScores", spec); } + + @Test + public void testAlleleBalance() throws IOException{ + HCTest(CEUTRIO_BAM, " -L 20:10001000-10010000 -A AlleleBalance -A AlleleBalanceBySample", "a210161843f4cb80143ff56e4e5c250f"); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java index a8d06a603..123ea7716 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java @@ -35,9 +35,9 @@ import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.Arrays; @@ -70,7 +70,7 @@ import java.util.Map; * */ -public class AlleleBalance extends InfoFieldAnnotation { +public class AlleleBalance extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -78,135 +78,105 @@ public class AlleleBalance extends InfoFieldAnnotation { final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - //if ( stratifiedContexts.size() == 0 ) - // return null; - - if ( !vc.isBiallelic() ) + if ( !(vc.isBiallelic() && vc.hasGenotypes())) { return null; - final GenotypesContext genotypes = vc.getGenotypes(); - if ( !vc.hasGenotypes() ) - return null; - - double ratioHom = 0.0; - double ratioHet = 0.0; - double weightHom = 0.0; - double weightHet = 0.0; - double overallNonDiploid = 0.0; - for ( Genotype genotype : genotypes ) { - - if ( vc.isSNP() ) { - - final int[] counts = getCounts(genotype, stratifiedContexts, vc); - // If AD was not calculated, we can't continue - if(counts == null) - continue; - - final int n_allele = counts.length; - int count_sum = 0; - for(int i=0; i bestOtherCount ) - bestOtherCount = counts[i]; - } - final int otherCount = count_sum - alleleCount; - ratioHom += pTrue*( (double) alleleCount)/((double) (alleleCount+bestOtherCount)); - weightHom += pTrue; - overallNonDiploid += ((double ) otherCount)/((double) count_sum*genotypes.size()); - } - // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of - // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from - // scratch, be my guest - but make sure it's done correctly! [EB] - } } - // make sure we had a het genotype + double refCountInHetSamples = 0.0; + double altCountInHetSamples = 0.0; + double correctCountInHomSamples = 0.0; + double incorrectCountInHomSamples = 0.0; + double nonDiploidCount = 0.0; + double totalReadCount = 0.0; + + for ( final Genotype genotype : vc.getGenotypes() ) { + if ( !vc.isSNP() ) { + continue; + } + + final int[] alleleCounts = getCounts(genotype, stratifiedContexts, vc); + + if (alleleCounts == null) continue; + + final long totalReads = MathUtils.sum(alleleCounts); + if ( genotype.isHet() ) { + // weight read counts by genotype quality so that e.g. mis-called homs don't affect the ratio too much + refCountInHetSamples += alleleCounts[0]; + altCountInHetSamples += alleleCounts[1]; + nonDiploidCount += totalReads - (alleleCounts[0] + alleleCounts[1]); + totalReadCount += totalReads; + } else if ( genotype.isHom() ) { + final int alleleIndex = genotype.isHomRef() ? 0 : 1 ; + final int alleleCount = alleleCounts[alleleIndex]; + int bestOtherCount = 0; + for(int n = 0; n < alleleCounts.length; n++){ + if( n != alleleIndex && alleleCounts[n] > bestOtherCount ) { + bestOtherCount = alleleCounts[n]; + } + } + correctCountInHomSamples += alleleCount; + incorrectCountInHomSamples += bestOtherCount; + nonDiploidCount += totalReads - alleleCount; + totalReadCount += totalReads; + } + // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of + // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from + // scratch, be my guest - but make sure it's done correctly! [EB] + + } + final double diploidCountInHetSamples = altCountInHetSamples + refCountInHetSamples; + final double diploidCountInHomSamples = correctCountInHomSamples + incorrectCountInHomSamples; Map map = new HashMap<>(); - if ( weightHet > 0.0 ) { - map.put(GATKVCFConstants.ALLELE_BALANCE_HET_KEY,ratioHet/weightHet); + if ( diploidCountInHetSamples > 0.0 ) { + map.put(GATKVCFConstants.ALLELE_BALANCE_HET_KEY, refCountInHetSamples / diploidCountInHetSamples); } - if ( weightHom > 0.0 ) { - map.put(GATKVCFConstants.ALLELE_BALANCE_HOM_KEY,ratioHom/weightHom); + if ( diploidCountInHomSamples > 0.0 ) { + map.put(GATKVCFConstants.ALLELE_BALANCE_HOM_KEY, correctCountInHomSamples / diploidCountInHomSamples); } - if ( overallNonDiploid > 0.0 ) { - map.put(GATKVCFConstants.NON_DIPLOID_RATIO_KEY,overallNonDiploid); + if ( totalReadCount > 0.0 ) { + map.put(GATKVCFConstants.NON_DIPLOID_RATIO_KEY, nonDiploidCount / totalReadCount); } return map; } /** - * Provide a centralized method of getting the number of reads per allele, - * depending on the input given. Will use the following (in order of preference): + * Get the number of reads per allele, using the following (in order of preference): * - genotype.getAD() * - reads from an AlignmentContext * - reads from a PerReadAlleleLikelihoodMap (Not yet implemented) * - * * @param genotype The genotype of interest - * @param stratifiedContexts A mapping - * @param vc - * @return + * @param stratifiedContexts A mapping of sample name to read alignments at a location + * @param vc The Variant Context + * @return The number of reads per allele */ private int[] getCounts(final Genotype genotype, final Map stratifiedContexts, final VariantContext vc){ - - // Can't do anything without a genotype here if(genotype == null) return null; - int[] retVal = genotype.getAD(); - AlignmentContext context; + if (genotype.hasAD()) { + return genotype.getAD(); + } else { // If getAD() returned no information we count alleles from the pileup + final AlignmentContext context = stratifiedContexts == null ? null : stratifiedContexts.get(genotype.getSampleName()); + if (context == null) return null; - if ( retVal == null && stratifiedContexts != null && - (context = stratifiedContexts.get(genotype.getSampleName())) != null){ - // If we get to this point, the getAD() function returned no information - // about AlleleDepth by Sample - perhaps it wasn't annotated? - // In that case, let's try to build it up using the algorithm that - // was here in v 3.1-1 and earlier - // Also, b/c of the assignment check in the if statement above, - // we know we have a valid AlignmentContext for this sample! - - final ReadBackedPileup pileup = context.getBasePileup(); - final String bases = new String(pileup.getBases()); - List alleles = vc.getAlleles(); - final int n_allele = alleles.size(); - retVal = new int[n_allele]; - - // Calculate the depth for each allele, under the assumption that - // the allele is a single base - int i=0; - for(Allele a : alleles){ - retVal[i] = MathUtils.countOccurrences(a.toString().charAt(0), bases); - i++; + final byte[] bases = context.getBasePileup().getBases(); + // Should be able to replace with the following, but this annotation was not found when using -A AlleleBalance + // return vc.getAlleles().stream().map(a -> MathUtils.countOccurrences(a.getBases()[0], bases)).mapToInt(Integer::intValue).toArray(); + final List alleles = vc.getAlleles(); + final int[] result = new int[alleles.size()]; + // Calculate the depth for each allele, assuming that the allele is a single base + for(int n = 0; n < alleles.size(); n++){ + result[n] = MathUtils.countOccurrences(alleles.get(n).getBases()[0], bases); } + return result; } - - return retVal; - } @Override diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 24913bfe9..dd0caa865 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -32,6 +32,7 @@ import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; +import org.apache.commons.lang.mutable.MutableInt; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -84,36 +85,23 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - - - // We need a heterozygous genotype and either a context or alleleLikelihoodMap - if ( g == null || !g.isCalled() || !g.isHet() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + // We need a heterozygous genotype and either a context or non-empty alleleLikelihoodMap + if ( g == null || !g.isCalled() || !g.isHet() || + ( stratifiedContext == null && (alleleLikelihoodMap == null || alleleLikelihoodMap.isEmpty())) ) return; - // Test for existence of allele, and manually check isSNP() - // and isBiallelic() while ignoring the allele - boolean biallelicSNP = vc.isSNP() && vc.isBiallelic(); - if(vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)){ - // If we have the GVCF allele, then the SNP is biallelic - // iff there are 3 alleles and both the reference and first alt - // allele are length 1. - biallelicSNP = vc.getAlleles().size() == 3 && - vc.getReference().length() == 1 && - vc.getAlternateAllele(0).length() == 1; - } + // If we have a allele the SNP is biallelic if there are 3 alleles and both the reference and first alt allele are length 1. + final boolean biallelicSNP = vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? + vc.getAlleles().size() == 3 && vc.getReference().length() == 1 && vc.getAlternateAllele(0).length() == 1 : + vc.isSNP() && vc.isBiallelic(); if ( !biallelicSNP ) return; - Double ratio; - if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) - ratio = annotateWithLikelihoods(alleleLikelihoodMap, vc); - else if ( stratifiedContext != null ) - ratio = annotateWithPileup(stratifiedContext, vc); - else - return; - + final Double ratio = (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) ? + annotateWithLikelihoods(alleleLikelihoodMap, vc) : + annotateWithPileup(stratifiedContext, vc); if (ratio == null) return; @@ -121,58 +109,42 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim } private Double annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc) { - - final HashMap alleleCounts = new HashMap<>(); + final HashMap alleleCounts = new HashMap<>(); for ( final Allele allele : vc.getAlleles() ) - alleleCounts.put(allele.getBases()[0], 0); + alleleCounts.put(allele.getBases()[0], new MutableInt(0)); - final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + for ( final byte base : stratifiedContext.getBasePileup().getBases() ) { + if ( alleleCounts.containsKey(base) ) + alleleCounts.get(base).increment(); } - // we need to add counts in the correct order - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); - - // sanity check - if(counts[0] + counts[1] == 0) - return null; - - return ((double) counts[0] / (double)(counts[0] + counts[1])); + final int refCount = alleleCounts.get(vc.getReference().getBases()[0]).intValue(); + final int altCount = alleleCounts.get(vc.getAlternateAllele(0).getBases()[0]).intValue(); + return (refCount + altCount == 0) ? null : ((double) refCount) / (refCount + altCount); } private Double annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) { final Set alleles = new HashSet<>(vc.getAlleles()); // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) + if (!perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles)) throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); - final HashMap alleleCounts = new HashMap<>(); - for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - - for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if (! a.isInformative() ) continue; // read is non-informative - final int prevCount = alleleCounts.get(a.getMostLikelyAllele()); - alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1); + final HashMap alleleCounts = new HashMap<>(); + for (final Allele allele : vc.getAlleles()) { + alleleCounts.put(allele, new MutableInt(0)); } - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(vc.getReference()); - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); - - // sanity check - if(counts[0] + counts[1] == 0) - return null; - - return ((double) counts[0] / (double)(counts[0] + counts[1])); + for (final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if (a.isInformative()) { + alleleCounts.get(a.getMostLikelyAllele()).increment(); + } + } + final int refCount = alleleCounts.get(vc.getReference()).intValue(); + final int altCount = alleleCounts.get(vc.getAlternateAllele(0)).intValue(); + return (refCount + altCount == 0) ? null : ((double) refCount) / (refCount + altCount); } public List getKeyNames() { return Arrays.asList(GATKVCFConstants.ALLELE_BALANCE_KEY); }