From d534241f35505d757f39d482d45d3316d6dd944e Mon Sep 17 00:00:00 2001 From: delangel Date: Sat, 4 Jun 2011 15:34:24 +0000 Subject: [PATCH] Major revamp of annotations for indels: a) All rank sum tests now work for indels including multiallelic sites. For the latter cases, rank sum test is REF vs most common allele b) Redid computation of HaplotypeScore for indels. It's now trivially easy to do because we are already computing likelihoods of each read vs haplotypes in GL computation so we reuse that if available. For multiallelic case, we score against N haplotypes where N is total called alleles. Drawback is that all cases need information contained in likelihood table that stores likelihood for each pileup element, for each allele. If this table is not available we dont annotate, so we can only fully annotate indels right now when running UG but not when running VariantAnnotator alone. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5947 348d0f76-0448-11de-a6fe-93d51630548a --- .../annotator/BaseQualityRankSumTest.java | 40 +++++++- .../gatk/walkers/annotator/FisherStrand.java | 99 +++++++++++++++++-- .../sting/gatk/walkers/annotator/GLstats.java | 18 +++- .../walkers/annotator/HaplotypeScore.java | 84 +++++++++++++--- .../annotator/MappingQualityRankSumTest.java | 40 +++++++- .../gatk/walkers/annotator/RankSumTest.java | 72 +++++++++++--- .../walkers/annotator/ReadPosRankSumTest.java | 51 +++++++++- .../UnifiedGenotyperIntegrationTest.java | 31 +++--- 8 files changed, 375 insertions(+), 60 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 782a321db..588a86087 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -1,10 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Arrays; @@ -14,15 +18,45 @@ public class BaseQualityRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) ) { if ( p.getBase() == ref ) { - refQuals.add((int)p.getQual()); + refQuals.add((double)p.getQual()); } else if ( p.getBase() == alt ) { - altQuals.add((int)p.getQual()); + altQuals.add((double)p.getQual()); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? + HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) + refQuals.add(-10.0*refLikelihood); + else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) + altQuals.add(-10.0*altLikelihood); + + + } + } + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 285be39c1..f47460885 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broad.tribble.util.variantcontext.VariantContext; @@ -36,11 +37,10 @@ import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFInfoHeaderLine; import org.broad.tribble.vcf.VCFHeaderLineType; import cern.jet.math.Arithmetic; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import java.util.List; -import java.util.Map; -import java.util.Arrays; -import java.util.HashMap; +import java.util.*; public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotation { @@ -52,10 +52,22 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati private static final double MIN_PVALUE = 1E-320; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( ! vc.isVariant() || vc.isFiltered() || ! vc.isBiallelic() || ! vc.isSNP() ) + if ( ! vc.isVariant() || vc.isFiltered() ) + return null; + + + int[][] table; + + if (vc.isBiallelic() && vc.isSNP()) + table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0)); + else if (vc.isIndel() || vc.isMixed()) { + table = getIndelContingencyTable(stratifiedContexts, vc); + if (table == null) + return null; + } + else return null; - int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0)); Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); if ( pvalue == null ) return null; @@ -188,7 +200,7 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getSNPContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { int[][] table = new int[2][2]; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -213,6 +225,79 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati } } + return table; + } + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getIndelContingencyTable(Map stratifiedContexts, VariantContext vc) { + final double INDEL_LIKELIHOOD_THRESH = 0.3; + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + + if (indelLikelihoodMap == null) + return null; + + int[][] table = new int[2][2]; + + for ( String sample : stratifiedContexts.keySet() ) { + final AlignmentContext context = stratifiedContexts.get(sample); + if ( context == null ) + continue; + + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup == null) + continue; + + for (final PileupElement p: pileup) { + if ( p.getRead().getMappingQuality() < 20) + continue; + if (indelLikelihoodMap.containsKey(p)) { + // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. + // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). + // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. + // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" + // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + boolean isFW = !p.getRead().getReadNegativeStrandFlag(); + + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + + boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)); + boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)); + if ( matchesRef || matchesAlt ) { + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + + + } + } + } + return table; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java index 367bce7c6..2a486b699 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java @@ -29,9 +29,19 @@ public class GLstats implements InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { final Map genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isBiallelic() ) + if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; + int idxAA = 0, idxAB = 1, idxBB = 2; + + if (!vc.isBiallelic()) { + // for non-bliallelic case, do test with most common alt allele. + // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. + int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount()); + idxAA = idxVector[0]; + idxAB = idxVector[1]; + idxBB = idxVector[2]; + } double refCount = 0.0; double hetCount = 0.0; double homCount = 0.0; @@ -43,9 +53,9 @@ public class GLstats implements InfoFieldAnnotation { N++; final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10( g.getLikelihoods().getAsVector() ); - refCount += normalizedLikelihoods[0]; - hetCount += normalizedLikelihoods[1]; - homCount += normalizedLikelihoods[2]; + refCount += normalizedLikelihoods[idxAA]; + hetCount += normalizedLikelihoods[idxAB]; + homCount += normalizedLikelihoods[idxBB]; } final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index b923e33c4..05914d4b5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeaderLineType; @@ -33,6 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotype.Haplotype; @@ -49,9 +51,12 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private final static char REGEXP_WILDCARD = '.'; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.isBiallelic() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here + if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here return null; - + + if (vc.isSNP() && !vc.isBiallelic()) + return null; + final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); @@ -69,7 +74,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { if (pileup == null) return null; - final List haplotypes = computeHaplotypes(pileup, contextSize, locus); + final List haplotypes = computeHaplotypes(pileup, contextSize, locus, vc); final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { @@ -86,7 +91,17 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { thisPileup = null; if (thisPileup != null) { - scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + if (vc.isSNP()) + scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + else if (vc.isIndel() || vc.isMixed()) { + Double d = scoreIndelsAgainstHaplotypes(thisPileup); + if (d == null) + return null; + scoreRA.add( d ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + else + return null; + } } } @@ -110,8 +125,11 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) { + private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) { // Compute all possible haplotypes consistent with current pileup + + int haplotypesToCompute = vc.getAlternateAlleles().size()+1; + final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); @@ -148,16 +166,22 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - // Now retrieve the two most popular haplotypes + // Now retrieve the N most popular haplotypes if (consensusHaplotypeQueue.size() > 0) { // Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); - Haplotype haplotype2 = consensusHaplotypeQueue.poll(); - if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything - } else { + + Listhlist = new ArrayList(); + hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60)); + + for (int k=1;k < haplotypesToCompute; k++) { + Haplotype haplotype2 = consensusHaplotypeQueue.poll(); + if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20)); + } + return hlist; + } else return null; - } } private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { @@ -321,6 +345,44 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return mismatches - expected; } + + + private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { + final ArrayList haplotypeScores = new ArrayList(); + + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + + if (indelLikelihoodMap== null) + return null; + + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[el.size()]; + int i = 0; + for (Allele a: el.keySet() ) { + scores[i++] = -el.get(a); + if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i-1, scores[i-1]); } + } + + haplotypeScores.add(scores); + } + } + + // indel likelihoods are stric log-probs, not phred scored + double overallScore = 0.0; + for ( final double[] readHaplotypeScores : haplotypeScores ) { + overallScore += MathUtils.arrayMin(readHaplotypeScores); + } + + return overallScore; + + } + + public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 7563a89b1..0f874ed7f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -1,10 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Arrays; @@ -15,15 +19,45 @@ public class MappingQualityRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) && p.getMappingQual() < 254 ) { // 254 and 255 are special mapping qualities used as a code by aligners if ( p.getBase() == ref ) { - refQuals.add(p.getMappingQual()); + refQuals.add((double)p.getMappingQual()); } else if ( p.getBase() == alt ) { - altQuals.add(p.getMappingQual()); + altQuals.add((double)p.getMappingQual()); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? + HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() < 254) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) + refQuals.add((double)p.getMappingQual()); + else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) + altQuals.add((double)p.getMappingQual()); + + + } + } + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index bd91c47ba..e5e08472a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -1,11 +1,13 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -17,37 +19,79 @@ import java.util.Map; import java.util.HashMap; + public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnotation { + static final double INDEL_LIKELIHOOD_THRESH = 0.1; + static final boolean DEBUG = false; + public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; - if ( !vc.isBiallelic() || !vc.isSNP() ) - return null; - final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; - final ArrayList refQuals = new ArrayList(); - final ArrayList altQuals = new ArrayList(); - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); - if ( context == null ) { - continue; + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + if (vc.isSNP() && vc.isBiallelic()) { + // todo - no current support for multiallelic snps + for ( final Map.Entry genotype : genotypes.entrySet() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + if ( context == null ) { + continue; + } + fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); } - fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); } + else if (vc.isIndel() || vc.isMixed()) { + + for ( final Map.Entry genotype : genotypes.entrySet() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + if ( context == null ) { + continue; + } + + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup == null) + continue; + + if (IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null || + IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0) + return null; + + fillIndelQualsFromPileup(pileup, refQuals, altQuals); + } + } + else + return null; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for ( final Integer qual : altQuals ) { + for ( final Double qual : altQuals ) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } - for ( final Integer qual : refQuals ) { + for ( final Double qual : refQuals ) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); } + if (DEBUG) { + System.out.format("%s, REF QUALS:",this.getClass().getName()); + for ( final Double qual : refQuals ) + System.out.format("%4.1f ",qual); + System.out.println(); + System.out.format("%s, ALT QUALS:",this.getClass().getName()); + for ( final Double qual : altQuals ) + System.out.format("%4.1f ",qual); + System.out.println(); + + } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) final Pair testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 ); @@ -55,9 +99,11 @@ public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnota if ( ! Double.isNaN(testResults.first) ) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; + } - protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals); + protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals); + protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); protected static boolean isUsableBase( final PileupElement p ) { return !( p.isDeletion() || p.getMappingQual() == 0 || ((int)p.getQual()) < 6 ); // need the unBAQed quality score here diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 683223044..9d5f2ec1a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; /** @@ -21,7 +25,7 @@ public class ReadPosRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); @@ -31,11 +35,52 @@ public class ReadPosRankSumTest extends RankSumTest { } if ( p.getBase() == ref ) { - refQuals.add( readPos ); + refQuals.add( (double)readPos ); } else if ( p.getBase() == alt ) { - altQuals.add( readPos ); + altQuals.add( (double)readPos ); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele + // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. + // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). + // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. + // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pielup element is "ref" + // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); + final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); + if( readPos > numAlignedBases / 2 ) { + readPos = numAlignedBases - ( readPos + 1 ); + } + + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) + refQuals.add((double)readPos); + else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) + altQuals.add((double)readPos); + + + } + } + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 40c78e738..80c0ed9d0 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("91246d154ba379cb04a1920d365f11c3")); + Arrays.asList("82d77402919f5d5b627a789f0bfffbb9")); executeTest("test MultiSample Pilot1", spec); } @@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("61db698ea6d11deba2fc900bd544b082")); + Arrays.asList("274a4eae68b0083191d59b95db95cbfc")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "af5e98318f56e203b25361fee4d41548"; + private final static String COMPRESSED_OUTPUT_MD5 = "e01fd00964e4340889a59edddd93bc48"; @Test public void testCompressedOutput() { @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { HashMap e = new HashMap(); e.put( "-sites_only", "71f61655f725cda56bc46d99d1cc24eb" ); e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "8193a4c06ddbd82d0a328491118b16a8" ); - e.put( "--output_mode EMIT_ALL_SITES", "89859bd987e8b797d37f5dec54d37e9c" ); + e.put( "--output_mode EMIT_ALL_SITES", "74c25dedf25652e35707fb617d7637b6" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -186,8 +186,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "649590cbfc67bddf82416a0b82027696" ); - e.put( 1.0 / 1850, "22153b7d06d13ea1a1ffdc45d7668974" ); + e.put( 0.01, "cb8da711af63409f75c49cff3680b7e1" ); + e.put( 1.0 / 1850, "4249a2ab3ff7516d3a77521d9516eb23" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa")); + Arrays.asList("4de696898979e57644e5c983e40b882a")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,7 +230,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("60cc4c793caaf891075d862a2376d73c")); + Arrays.asList("e31fb159a93011edd66c02b365c6c76e")); executeTest(String.format("test calling with BAQ"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq OFF", 1, - Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa")); + Arrays.asList("4de696898979e57644e5c983e40b882a")); executeTest(String.format("test calling with BAQ OFF"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("51726ce0b309a77cb833dd2e11e2219e")); + Arrays.asList("ca800d36708a337c3e29216c1b73bb6d")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("11d761009bfbc04ba23cbee000a62954")); + Arrays.asList("cb1e3d077b7fb17eb4f1be758ed4e4d6")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,25 +291,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("32a06420a3357c1451fdc36a40df4d08")); + Arrays.asList("8d4b2d5a093cbd6f421e85390d346f83")); executeTest(String.format("test indel calling, multiple technologies"), spec); } - // todo - feature not yet fully working with indels - //@Test + @Test public void testWithIndelAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("e95c545b8ae06f0721f260125cfbe1f0")); + Arrays.asList("a3351235c893d38ed4b1d23ab3cda744")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("6c96d76b9bc3aade0c768d7c657ae210")); + Arrays.asList("31afc73ecba6b466bcb17ef3ebdd7a99")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); }