diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 782a321db..588a86087 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -1,10 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Arrays; @@ -14,15 +18,45 @@ public class BaseQualityRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) ) { if ( p.getBase() == ref ) { - refQuals.add((int)p.getQual()); + refQuals.add((double)p.getQual()); } else if ( p.getBase() == alt ) { - altQuals.add((int)p.getQual()); + altQuals.add((double)p.getQual()); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
+ HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) + refQuals.add(-10.0*refLikelihood); + else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) + altQuals.add(-10.0*altLikelihood); + + + } + } + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 285be39c1..f47460885 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broad.tribble.util.variantcontext.VariantContext; @@ -36,11 +37,10 @@ import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFInfoHeaderLine; import org.broad.tribble.vcf.VCFHeaderLineType; import 
cern.jet.math.Arithmetic; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import java.util.List; -import java.util.Map; -import java.util.Arrays; -import java.util.HashMap; +import java.util.*; public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotation { @@ -52,10 +52,22 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati private static final double MIN_PVALUE = 1E-320; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( ! vc.isVariant() || vc.isFiltered() || ! vc.isBiallelic() || ! vc.isSNP() ) + if ( ! vc.isVariant() || vc.isFiltered() ) + return null; + + + int[][] table; + + if (vc.isBiallelic() && vc.isSNP()) + table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0)); + else if (vc.isIndel() || vc.isMixed()) { + table = getIndelContingencyTable(stratifiedContexts, vc); + if (table == null) + return null; + } + else return null; - int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0)); Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); if ( pvalue == null ) return null; @@ -188,7 +200,7 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getSNPContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { int[][] table = new int[2][2]; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -213,6 +225,79 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati } } + return table; + } + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getIndelContingencyTable(Map stratifiedContexts, VariantContext vc) { + final double INDEL_LIKELIHOOD_THRESH = 0.3; + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + + if (indelLikelihoodMap == null) + return null; + + int[][] table = new int[2][2]; + + for ( String sample : stratifiedContexts.keySet() ) { + final AlignmentContext context = stratifiedContexts.get(sample); + if ( context == null ) + continue; + + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup == null) + continue; + + for (final PileupElement p: pileup) { + if ( p.getRead().getMappingQuality() < 20) + continue; + if (indelLikelihoodMap.containsKey(p)) { + // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. + // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). + // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. 
+ // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" + // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element is "alt" + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + boolean isFW = !p.getRead().getReadNegativeStrandFlag(); + + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + + boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)); + boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)); + if ( matchesRef || matchesAlt ) { + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + + + } + } + } + return table; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java index 367bce7c6..2a486b699 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java @@ -29,9 +29,19 @@ public class GLstats implements InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { final Map genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isBiallelic() ) + if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; + int idxAA = 0, idxAB = 1, idxBB = 2; + + if (!vc.isBiallelic()) { + // for non-biallelic case, do test with most common alt allele. + // Get the corresponding indices in GL vectors to retrieve GL of AA,AB and BB. 
+ int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount()); + idxAA = idxVector[0]; + idxAB = idxVector[1]; + idxBB = idxVector[2]; + } double refCount = 0.0; double hetCount = 0.0; double homCount = 0.0; @@ -43,9 +53,9 @@ public class GLstats implements InfoFieldAnnotation { N++; final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10( g.getLikelihoods().getAsVector() ); - refCount += normalizedLikelihoods[0]; - hetCount += normalizedLikelihoods[1]; - homCount += normalizedLikelihoods[2]; + refCount += normalizedLikelihoods[idxAA]; + hetCount += normalizedLikelihoods[idxAB]; + homCount += normalizedLikelihoods[idxBB]; } final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index b923e33c4..05914d4b5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeaderLineType; @@ -33,6 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import 
org.broadinstitute.sting.utils.genotype.Haplotype; @@ -49,9 +51,12 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private final static char REGEXP_WILDCARD = '.'; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.isBiallelic() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here + if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here return null; - + + if (vc.isSNP() && !vc.isBiallelic()) + return null; + final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); @@ -69,7 +74,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { if (pileup == null) return null; - final List haplotypes = computeHaplotypes(pileup, contextSize, locus); + final List haplotypes = computeHaplotypes(pileup, contextSize, locus, vc); final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { @@ -86,7 +91,17 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { thisPileup = null; if (thisPileup != null) { - scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + if (vc.isSNP()) + scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + else if (vc.isIndel() || vc.isMixed()) { + Double d = scoreIndelsAgainstHaplotypes(thisPileup); + if (d == null) + return null; + scoreRA.add( d ); // Taking the simple average of all sample's score since the 
score can be negative and the RMS doesn't make sense + } + else + return null; + } } } @@ -110,8 +125,11 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) { + private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) { // Compute all possible haplotypes consistent with current pileup + + int haplotypesToCompute = vc.getAlternateAlleles().size()+1; + final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); @@ -148,16 +166,22 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - // Now retrieve the two most popular haplotypes + // Now retrieve the N most popular haplotypes if (consensusHaplotypeQueue.size() > 0) { // Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); - Haplotype haplotype2 = consensusHaplotypeQueue.poll(); - if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything - } else { + + Listhlist = new ArrayList(); + hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60)); + + for (int k=1;k < haplotypesToCompute; k++) { + Haplotype haplotype2 = consensusHaplotypeQueue.poll(); + if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20)); + } + return hlist; + } else return null; - } } 
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { @@ -321,6 +345,44 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return mismatches - expected; } + + + private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { + final ArrayList haplotypeScores = new ArrayList(); + + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + + if (indelLikelihoodMap== null) + return null; + + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[el.size()]; + int i = 0; + for (Allele a: el.keySet() ) { + scores[i++] = -el.get(a); + if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i-1, scores[i-1]); } + } + + haplotypeScores.add(scores); + } + } + + // indel likelihoods are strict log-probs, not phred scored + double overallScore = 0.0; + for ( final double[] readHaplotypeScores : haplotypeScores ) { + overallScore += MathUtils.arrayMin(readHaplotypeScores); + } + + return overallScore; + + } + + public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 7563a89b1..0f874ed7f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -1,10 +1,14 @@ 
package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Arrays; @@ -15,15 +19,45 @@ public class MappingQualityRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) && p.getMappingQual() < 254 ) { // 254 and 255 are special mapping qualities used as a code by aligners if ( p.getBase() == ref ) { - refQuals.add(p.getMappingQual()); + refQuals.add((double)p.getMappingQual()); } else if ( p.getBase() == alt ) { - altQuals.add(p.getMappingQual()); + altQuals.add((double)p.getMappingQual()); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
+ HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() < 254) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) + refQuals.add((double)p.getMappingQual()); + else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) + altQuals.add((double)p.getMappingQual()); + + + } + } + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index bd91c47ba..e5e08472a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -1,11 +1,13 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import 
org.broadinstitute.sting.utils.pileup.PileupElement; @@ -17,37 +19,79 @@ import java.util.Map; import java.util.HashMap; + public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnotation { + static final double INDEL_LIKELIHOOD_THRESH = 0.1; + static final boolean DEBUG = false; + public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; - if ( !vc.isBiallelic() || !vc.isSNP() ) - return null; - final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; - final ArrayList refQuals = new ArrayList(); - final ArrayList altQuals = new ArrayList(); - for ( final Map.Entry genotype : genotypes.entrySet() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); - if ( context == null ) { - continue; + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + if (vc.isSNP() && vc.isBiallelic()) { + // todo - no current support for multiallelic snps + for ( final Map.Entry genotype : genotypes.entrySet() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + if ( context == null ) { + continue; + } + fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); } - fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals); } + else if (vc.isIndel() || vc.isMixed()) { + + for ( final Map.Entry genotype : genotypes.entrySet() ) { + final AlignmentContext context = stratifiedContexts.get(genotype.getKey()); + if ( context == null ) { + continue; + } + + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup == null) + continue; + + if 
(IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null || + IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0) + return null; + + fillIndelQualsFromPileup(pileup, refQuals, altQuals); + } + } + else + return null; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for ( final Integer qual : altQuals ) { + for ( final Double qual : altQuals ) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } - for ( final Integer qual : refQuals ) { + for ( final Double qual : refQuals ) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); } + if (DEBUG) { + System.out.format("%s, REF QUALS:",this.getClass().getName()); + for ( final Double qual : refQuals ) + System.out.format("%4.1f ",qual); + System.out.println(); + System.out.format("%s, ALT QUALS:",this.getClass().getName()); + for ( final Double qual : altQuals ) + System.out.format("%4.1f ",qual); + System.out.println(); + + } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) final Pair testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 ); @@ -55,9 +99,11 @@ public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnota if ( ! 
Double.isNaN(testResults.first) ) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; + } - protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals); + protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals); + protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); protected static boolean isUsableBase( final PileupElement p ) { return !( p.isDeletion() || p.getMappingQual() == 0 || ((int)p.getQual()) < 6 ); // need the unBAQed quality score here diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 683223044..9d5f2ec1a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; /** @@ -21,7 +25,7 @@ public class ReadPosRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias")); } - protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { + protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List refQuals, List altQuals) { for ( final PileupElement p : pileup ) { if( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); @@ -31,11 +35,52 @@ public class ReadPosRankSumTest extends RankSumTest { } if ( p.getBase() == ref ) { - refQuals.add( readPos ); + refQuals.add( (double)readPos ); } else if ( p.getBase() == alt ) { - altQuals.add( readPos ); + altQuals.add( (double)readPos ); } } } } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { + // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele + // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. + // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). + // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. 
+ // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" + // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element is "alt" + final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + for (final PileupElement p: pileup) { + if (indelLikelihoodMap.containsKey(p)) { + // retrieve likelihood information corresponding to this read + LinkedHashMap el = indelLikelihoodMap.get(p); + // by design, first element in LinkedHashMap was ref allele + double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; + + for (Allele a : el.keySet()) { + + if (a.isReference()) + refLikelihood =el.get(a); + else { + double like = el.get(a); + if (like >= altLikelihood) + altLikelihood = like; + } + } + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); + final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); + if( readPos > numAlignedBases / 2 ) { + readPos = numAlignedBases - ( readPos + 1 ); + } + + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) + refQuals.add((double)readPos); + else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) + altQuals.add((double)readPos); + + + } + } + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 40c78e738..80c0ed9d0 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + 
validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("91246d154ba379cb04a1920d365f11c3")); + Arrays.asList("82d77402919f5d5b627a789f0bfffbb9")); executeTest("test MultiSample Pilot1", spec); } @@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("61db698ea6d11deba2fc900bd544b082")); + Arrays.asList("274a4eae68b0083191d59b95db95cbfc")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "af5e98318f56e203b25361fee4d41548"; + private final static String COMPRESSED_OUTPUT_MD5 = "e01fd00964e4340889a59edddd93bc48"; @Test public void testCompressedOutput() { @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { HashMap e = new HashMap(); e.put( "-sites_only", "71f61655f725cda56bc46d99d1cc24eb" ); e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "8193a4c06ddbd82d0a328491118b16a8" ); - e.put( "--output_mode EMIT_ALL_SITES", "89859bd987e8b797d37f5dec54d37e9c" ); + e.put( "--output_mode EMIT_ALL_SITES", "74c25dedf25652e35707fb617d7637b6" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -186,8 +186,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "649590cbfc67bddf82416a0b82027696" ); - e.put( 1.0 / 1850, "22153b7d06d13ea1a1ffdc45d7668974" ); + e.put( 0.01, "cb8da711af63409f75c49cff3680b7e1" ); + e.put( 
1.0 / 1850, "4249a2ab3ff7516d3a77521d9516eb23" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa")); + Arrays.asList("4de696898979e57644e5c983e40b882a")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,7 +230,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("60cc4c793caaf891075d862a2376d73c")); + Arrays.asList("e31fb159a93011edd66c02b365c6c76e")); executeTest(String.format("test calling with BAQ"), spec); } @@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq OFF", 1, - Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa")); + Arrays.asList("4de696898979e57644e5c983e40b882a")); executeTest(String.format("test calling with BAQ OFF"), spec); } @@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("51726ce0b309a77cb833dd2e11e2219e")); + Arrays.asList("ca800d36708a337c3e29216c1b73bb6d")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("11d761009bfbc04ba23cbee000a62954")); + Arrays.asList("cb1e3d077b7fb17eb4f1be758ed4e4d6")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,25 +291,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("32a06420a3357c1451fdc36a40df4d08")); + Arrays.asList("8d4b2d5a093cbd6f421e85390d346f83")); 
executeTest(String.format("test indel calling, multiple technologies"), spec); } - // todo - feature not yet fully working with indels - //@Test + @Test public void testWithIndelAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("e95c545b8ae06f0721f260125cfbe1f0")); + Arrays.asList("a3351235c893d38ed4b1d23ab3cda744")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("6c96d76b9bc3aade0c768d7c657ae210")); + Arrays.asList("31afc73ecba6b466bcb17ef3ebdd7a99")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); }