Major revamp of annotations for indels:

a) All rank sum tests now work for indels including multiallelic sites. For the latter cases, rank sum test is REF vs most common allele
b) Redid computation of HaplotypeScore for indels. It's now trivially easy to do because we are already computing likelihoods of each read vs haplotypes in GL computation so we reuse that if available. For multiallelic case, we score against N haplotypes where N is total called alleles.

Drawback is that all cases need information contained in the likelihood table that stores the likelihood for each pileup element, for each allele. If this table is not available we don't annotate, so we can only fully annotate indels right now when running UG but not when running VariantAnnotator alone.
 


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5947 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-06-04 15:34:24 +00:00
parent 1448a1f155
commit d534241f35
8 changed files with 375 additions and 60 deletions

View File

@ -1,10 +1,14 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Arrays;
@ -14,15 +18,45 @@ public class BaseQualityRankSumTest extends RankSumTest {
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); }
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals) {
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
for ( final PileupElement p : pileup ) {
if( isUsableBase(p) ) {
if ( p.getBase() == ref ) {
refQuals.add((int)p.getQual());
refQuals.add((double)p.getQual());
} else if ( p.getBase() == alt ) {
altQuals.add((int)p.getQual());
altQuals.add((double)p.getQual());
}
}
}
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
// equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ?
HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
for (final PileupElement p: pileup) {
if (indelLikelihoodMap.containsKey(p)) {
// retrieve likelihood information corresponding to this read
LinkedHashMap<Allele,Double> el = indelLikelihoodMap.get(p);
// by design, first element in LinkedHashMap was ref allele
double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY;
for (Allele a : el.keySet()) {
if (a.isReference())
refLikelihood =el.get(a);
else {
double like = el.get(a);
if (like >= altLikelihood)
altLikelihood = like;
}
}
if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH)
refQuals.add(-10.0*refLikelihood);
else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH)
altQuals.add(-10.0*altLikelihood);
}
}
}
}

View File

@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broad.tribble.util.variantcontext.VariantContext;
@ -36,11 +37,10 @@ import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broad.tribble.vcf.VCFHeaderLineType;
import cern.jet.math.Arithmetic;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.List;
import java.util.Map;
import java.util.Arrays;
import java.util.HashMap;
import java.util.*;
public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotation {
@ -52,10 +52,22 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati
private static final double MIN_PVALUE = 1E-320;
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( ! vc.isVariant() || vc.isFiltered() || ! vc.isBiallelic() || ! vc.isSNP() )
if ( ! vc.isVariant() || vc.isFiltered() )
return null;
int[][] table;
if (vc.isBiallelic() && vc.isSNP())
table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0));
else if (vc.isIndel() || vc.isMixed()) {
table = getIndelContingencyTable(stratifiedContexts, vc);
if (table == null)
return null;
}
else
return null;
int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAllele(0));
Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE);
if ( pvalue == null )
return null;
@ -188,7 +200,7 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati
* allele2 # #
* @return a 2x2 contingency table
*/
private static int[][] getContingencyTable(Map<String, AlignmentContext> stratifiedContexts, Allele ref, Allele alt) {
private static int[][] getSNPContingencyTable(Map<String, AlignmentContext> stratifiedContexts, Allele ref, Allele alt) {
int[][] table = new int[2][2];
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
@ -213,6 +225,79 @@ public class FisherStrand implements InfoFieldAnnotation, WorkInProgressAnnotati
}
}
return table;
}
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
* allele1 # #
* allele2 # #
* @return a 2x2 contingency table
*/
/**
 * Builds the 2x2 strand contingency table for an indel/mixed site, classifying each
 * read as REF (row 0) or ALT (row 1) via the per-read allele likelihoods cached by
 * the genotyper, and as forward (column 0) or reverse (column 1) strand.
 * Returns null when the indel likelihood map is unavailable (e.g. when running
 * outside the Unified Genotyper), in which case the annotation is skipped.
 */
private static int[][] getIndelContingencyTable(Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
// margin by which one allele's likelihood must beat the other's before a read is counted
final double INDEL_LIKELIHOOD_THRESH = 0.3;
final HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
if (indelLikelihoodMap == null)
return null;
int[][] table = new int[2][2];
for ( String sample : stratifiedContexts.keySet() ) {
final AlignmentContext context = stratifiedContexts.get(sample);
if ( context == null )
continue;
// prefer the extended-event pileup (carries indel events), falling back to the base pileup
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup == null)
continue;
for (final PileupElement p: pileup) {
// skip poorly mapped reads; 20 is a hard-coded MQ floor for this table
if ( p.getRead().getMappingQuality() < 20)
continue;
if (indelLikelihoodMap.containsKey(p)) {
// to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element.
// A pileup element then has a list of pairs of form (Allele, likelihood of this allele).
// To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles.
// If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref"
// otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element is "alt"
// retrieve likelihood information corresponding to this read
LinkedHashMap<Allele,Double> el = indelLikelihoodMap.get(p);
// by design, first element in LinkedHashMap was ref allele
boolean isFW = !p.getRead().getReadNegativeStrandFlag();
double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY;
for (Allele a : el.keySet()) {
if (a.isReference())
refLikelihood =el.get(a);
else {
// keep the best-scoring alt allele (handles multiallelic sites)
double like = el.get(a);
if (like >= altLikelihood)
altLikelihood = like;
}
}
boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH));
boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH));
// ambiguous reads (neither clearly ref nor clearly alt) are not counted
if ( matchesRef || matchesAlt ) {
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column]++;
}
}
}
}
return table;
}
}

View File

@ -29,9 +29,19 @@ public class GLstats implements InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
final Map<String, Genotype> genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isBiallelic() )
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
return null;
int idxAA = 0, idxAB = 1, idxBB = 2;
if (!vc.isBiallelic()) {
// for the non-biallelic case, do the test with the most common alt allele.
// Get the corresponding indices in the GL vectors to retrieve the GLs of AA, AB and BB.
int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount());
idxAA = idxVector[0];
idxAB = idxVector[1];
idxBB = idxVector[2];
}
double refCount = 0.0;
double hetCount = 0.0;
double homCount = 0.0;
@ -43,9 +53,9 @@ public class GLstats implements InfoFieldAnnotation {
N++;
final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10( g.getLikelihoods().getAsVector() );
refCount += normalizedLikelihoods[0];
hetCount += normalizedLikelihoods[1];
homCount += normalizedLikelihoods[2];
refCount += normalizedLikelihoods[idxAA];
hetCount += normalizedLikelihoods[idxAB];
homCount += normalizedLikelihoods[idxBB];
}
final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeaderLineType;
@ -33,6 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
@ -49,9 +51,12 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
private final static char REGEXP_WILDCARD = '.';
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( !vc.isBiallelic() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
return null;
if (vc.isSNP() && !vc.isBiallelic())
return null;
final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values());
final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
@ -69,7 +74,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
if (pileup == null)
return null;
final List<Haplotype> haplotypes = computeHaplotypes(pileup, contextSize, locus);
final List<Haplotype> haplotypes = computeHaplotypes(pileup, contextSize, locus, vc);
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
if (haplotypes != null) {
@ -86,7 +91,17 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
thisPileup = null;
if (thisPileup != null) {
scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
if (vc.isSNP())
scoreRA.add( scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus) ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
else if (vc.isIndel() || vc.isMixed()) {
Double d = scoreIndelsAgainstHaplotypes(thisPileup);
if (d == null)
return null;
scoreRA.add( d ); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
}
else
return null;
}
}
}
@ -110,8 +125,11 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
}
}
private List<Haplotype> computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) {
private List<Haplotype> computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus, final VariantContext vc) {
// Compute all possible haplotypes consistent with current pileup
int haplotypesToCompute = vc.getAlternateAlleles().size()+1;
final PriorityQueue<Haplotype> candidateHaplotypeQueue = new PriorityQueue<Haplotype>(100, new HaplotypeComparator());
final PriorityQueue<Haplotype> consensusHaplotypeQueue = new PriorityQueue<Haplotype>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator());
@ -148,16 +166,22 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
}
}
// Now retrieve the two most popular haplotypes
// Now retrieve the N most popular haplotypes
if (consensusHaplotypeQueue.size() > 0) {
// Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything
} else {
List<Haplotype>hlist = new ArrayList<Haplotype>();
hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60));
for (int k=1;k < haplotypesToCompute; k++) {
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20));
}
return hlist;
} else
return null;
}
}
private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) {
@ -321,6 +345,44 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
return mismatches - expected;
}
/**
 * Scores a pileup against the candidate indel alleles using the per-read likelihoods
 * cached by the genotyper during GL computation: for each read, the best (minimum
 * negated) likelihood over all alleles is summed into the overall score.
 * Returns null when the indel likelihood map is unavailable (e.g. when not running
 * inside the Unified Genotyper), so callers can skip the annotation.
 */
private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) {
final ArrayList<double[]> haplotypeScores = new ArrayList<double[]>();
final HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
if (indelLikelihoodMap== null)
return null;
for (final PileupElement p: pileup) {
if (indelLikelihoodMap.containsKey(p)) {
// retrieve likelihood information corresponding to this read
LinkedHashMap<Allele,Double> el = indelLikelihoodMap.get(p);
// Score all the reads in the pileup, even the filtered ones;
// negate so that a smaller score means a better-supported allele
final double[] scores = new double[el.size()];
int i = 0;
for (Allele a: el.keySet() ) {
scores[i++] = -el.get(a);
if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i-1, scores[i-1]); }
}
haplotypeScores.add(scores);
}
}
// indel likelihoods are strict log-probs, not phred scored
double overallScore = 0.0;
for ( final double[] readHaplotypeScores : haplotypeScores ) {
// each read contributes the score of its best-matching allele
overallScore += MathUtils.arrayMin(readHaplotypeScores);
}
return overallScore;
}
public List<String> getKeyNames() { return Arrays.asList("HaplotypeScore"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); }
}

View File

@ -1,10 +1,14 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Arrays;
@ -15,15 +19,45 @@ public class MappingQualityRankSumTest extends RankSumTest {
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); }
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals) {
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
for ( final PileupElement p : pileup ) {
if( isUsableBase(p) && p.getMappingQual() < 254 ) { // 254 and 255 are special mapping qualities used as a code by aligners
if ( p.getBase() == ref ) {
refQuals.add(p.getMappingQual());
refQuals.add((double)p.getMappingQual());
} else if ( p.getBase() == alt ) {
altQuals.add(p.getMappingQual());
altQuals.add((double)p.getMappingQual());
}
}
}
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
    // For indels, the rank-sum input is each read's mapping quality; reads are
    // binned as REF or ALT using the per-read allele likelihoods cached by the
    // genotyper, and ambiguous reads are dropped.
    final HashMap<PileupElement, LinkedHashMap<Allele, Double>> likelihoodsByRead =
            IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
    for (final PileupElement element : pileup) {
        // 254 and 255 are special mapping qualities used as codes by aligners,
        // so such reads are excluded from the test.
        if (!likelihoodsByRead.containsKey(element) || element.getMappingQual() >= 254)
            continue;
        // Per-allele likelihoods for this read; by design the ref allele was
        // inserted first into the LinkedHashMap.
        final LinkedHashMap<Allele, Double> alleleLikelihoods = likelihoodsByRead.get(element);
        double refLik = 0.0;                          // likelihood of the reference allele
        double bestAltLik = Double.NEGATIVE_INFINITY; // best likelihood over all alt alleles
        for (final Allele allele : alleleLikelihoods.keySet()) {
            final double lik = alleleLikelihoods.get(allele);
            if (allele.isReference())
                refLik = lik;
            else if (lik >= bestAltLik)
                bestAltLik = lik;
        }
        // Only reads that clearly favor one allele contribute their mapping quality.
        if (refLik > bestAltLik + INDEL_LIKELIHOOD_THRESH)
            refQuals.add((double) element.getMappingQual());
        else if (bestAltLik > refLik + INDEL_LIKELIHOOD_THRESH)
            altQuals.add((double) element.getMappingQual());
    }
}
}

View File

@ -1,11 +1,13 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -17,37 +19,79 @@ import java.util.Map;
import java.util.HashMap;
public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnotation {
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
static final boolean DEBUG = false;
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
return null;
if ( !vc.isBiallelic() || !vc.isSNP() )
return null;
final Map<String, Genotype> genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() == 0 )
return null;
final ArrayList<Integer> refQuals = new ArrayList<Integer>();
final ArrayList<Integer> altQuals = new ArrayList<Integer>();
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
if ( context == null ) {
continue;
final ArrayList<Double> refQuals = new ArrayList<Double>();
final ArrayList<Double> altQuals = new ArrayList<Double>();
if (vc.isSNP() && vc.isBiallelic()) {
// todo - no current support for multiallelic snps
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
if ( context == null ) {
continue;
}
fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals);
}
fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).getBases()[0], context.getBasePileup(), refQuals, altQuals);
}
else if (vc.isIndel() || vc.isMixed()) {
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
if ( context == null ) {
continue;
}
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup == null)
continue;
if (IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null ||
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0)
return null;
fillIndelQualsFromPileup(pileup, refQuals, altQuals);
}
}
else
return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
for ( final Integer qual : altQuals ) {
for ( final Double qual : altQuals ) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
}
for ( final Integer qual : refQuals ) {
for ( final Double qual : refQuals ) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
}
if (DEBUG) {
System.out.format("%s, REF QUALS:",this.getClass().getName());
for ( final Double qual : refQuals )
System.out.format("%4.1f ",qual);
System.out.println();
System.out.format("%s, ALT QUALS:",this.getClass().getName());
for ( final Double qual : altQuals )
System.out.format("%4.1f ",qual);
System.out.println();
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final Pair<Double,Double> testResults = mannWhitneyU.runOneSidedTest( MannWhitneyU.USet.SET1 );
@ -55,9 +99,11 @@ public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnota
if ( ! Double.isNaN(testResults.first) )
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
return map;
}
protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals);
protected abstract void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
protected static boolean isUsableBase( final PileupElement p ) {
return !( p.isDeletion() || p.getMappingQual() == 0 || ((int)p.getQual()) < 6 ); // need the unBAQed quality score here

View File

@ -1,12 +1,16 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.vcf.VCFHeaderLineType;
import org.broad.tribble.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
/**
@ -21,7 +25,7 @@ public class ReadPosRankSumTest extends RankSumTest {
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); }
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals) {
protected void fillQualsFromPileup(byte ref, byte alt, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
for ( final PileupElement p : pileup ) {
if( isUsableBase(p) ) {
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0);
@ -31,11 +35,52 @@ public class ReadPosRankSumTest extends RankSumTest {
}
if ( p.getBase() == ref ) {
refQuals.add( readPos );
refQuals.add( (double)readPos );
} else if ( p.getBase() == alt ) {
altQuals.add( readPos );
altQuals.add( (double)readPos );
}
}
}
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
    // For indels, the rank-sum input is the read position of each pileup element,
    // folded to the distance from the nearer read end. A read is binned as REF
    // when the reference-allele likelihood beats the best alt-allele likelihood
    // by the threshold, as ALT in the symmetric case, and dropped when ambiguous.
    final HashMap<PileupElement, LinkedHashMap<Allele, Double>> likelihoodsByRead =
            IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
    for (final PileupElement element : pileup) {
        if (!likelihoodsByRead.containsKey(element))
            continue; // no likelihood information was recorded for this read
        // Per-allele likelihoods for this read; by design the ref allele was
        // inserted first into the LinkedHashMap.
        final LinkedHashMap<Allele, Double> alleleLikelihoods = likelihoodsByRead.get(element);
        double refLik = 0.0;                          // likelihood of the reference allele
        double bestAltLik = Double.NEGATIVE_INFINITY; // best likelihood over all alt alleles
        for (final Allele allele : alleleLikelihoods.keySet()) {
            final double lik = alleleLikelihoods.get(allele);
            if (allele.isReference())
                refLik = lik;
            else if (lik >= bestAltLik)
                bestAltLik = lik;
        }
        // Offset of this element within the read's aligned bases.
        int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(element.getRead().getCigar(), element.getOffset(), 0, 0);
        final int alignedLength = AlignmentUtils.getNumAlignedBases(element.getRead());
        // Mirror positions in the second half so the value measures distance
        // from the nearer read end.
        if (readPos > alignedLength / 2)
            readPos = alignedLength - (readPos + 1);
        if (refLik > bestAltLik + INDEL_LIKELIHOOD_THRESH)
            refQuals.add((double) readPos);
        else if (bestAltLik > refLik + INDEL_LIKELIHOOD_THRESH)
            altQuals.add((double) readPos);
    }
}
}

View File

@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("91246d154ba379cb04a1920d365f11c3"));
Arrays.asList("82d77402919f5d5b627a789f0bfffbb9"));
executeTest("test MultiSample Pilot1", spec);
}
@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("61db698ea6d11deba2fc900bd544b082"));
Arrays.asList("274a4eae68b0083191d59b95db95cbfc"));
executeTest("test SingleSample Pilot2", spec);
}
@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "af5e98318f56e203b25361fee4d41548";
private final static String COMPRESSED_OUTPUT_MD5 = "e01fd00964e4340889a59edddd93bc48";
@Test
public void testCompressedOutput() {
@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "71f61655f725cda56bc46d99d1cc24eb" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "8193a4c06ddbd82d0a328491118b16a8" );
e.put( "--output_mode EMIT_ALL_SITES", "89859bd987e8b797d37f5dec54d37e9c" );
e.put( "--output_mode EMIT_ALL_SITES", "74c25dedf25652e35707fb617d7637b6" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -186,8 +186,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "649590cbfc67bddf82416a0b82027696" );
e.put( 1.0 / 1850, "22153b7d06d13ea1a1ffdc45d7668974" );
e.put( 0.01, "cb8da711af63409f75c49cff3680b7e1" );
e.put( 1.0 / 1850, "4249a2ab3ff7516d3a77521d9516eb23" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa"));
Arrays.asList("4de696898979e57644e5c983e40b882a"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -230,7 +230,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("60cc4c793caaf891075d862a2376d73c"));
Arrays.asList("e31fb159a93011edd66c02b365c6c76e"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -244,7 +244,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq OFF",
1,
Arrays.asList("d453f2e44fb7f0ec027c2a5f791da6aa"));
Arrays.asList("4de696898979e57644e5c983e40b882a"));
executeTest(String.format("test calling with BAQ OFF"), spec);
}
@ -263,7 +263,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("51726ce0b309a77cb833dd2e11e2219e"));
Arrays.asList("ca800d36708a337c3e29216c1b73bb6d"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -278,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("11d761009bfbc04ba23cbee000a62954"));
Arrays.asList("cb1e3d077b7fb17eb4f1be758ed4e4d6"));
executeTest(String.format("test indel caller in SLX witn low min allele count"), spec);
}
@ -291,25 +291,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("32a06420a3357c1451fdc36a40df4d08"));
Arrays.asList("8d4b2d5a093cbd6f421e85390d346f83"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
// todo - feature not yet fully working with indels
//@Test
@Test
public void testWithIndelAllelesPassedIn() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("e95c545b8ae06f0721f260125cfbe1f0"));
Arrays.asList("a3351235c893d38ed4b1d23ab3cda744"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("6c96d76b9bc3aade0c768d7c657ae210"));
Arrays.asList("31afc73ecba6b466bcb17ef3ebdd7a99"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}