From 5bf9dd2def3e5aa166468e79beb3e236a94fff7c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 11 Apr 2012 14:43:40 -0400 Subject: [PATCH] A framework to get annotations working in the HaplotypeCaller (and ART walkers in general). Adding support for active-region-based annotation for most standard annotations. I need to discuss with Ryan what to do about tests that require offsets into the reads (since I don't have access to the offsets) like e.g. the ReadPosRankSumTest. IMPORTANT NOTE: this is still very much a dev effort and can only be accessed through private walkers (i.e. the HaplotypeCaller). The interface is in flux and so we are making no attempt at all to make it clean or to merge this with the Locus-Traversal-based annotation system. When we are satisfied that it's working properly and have settled on the proper interface, we will clean it up then. --- .../annotator/BaseQualityRankSumTest.java | 31 ++++++++-- .../walkers/annotator/ChromosomeCounts.java | 12 +++- .../walkers/annotator/DepthOfCoverage.java | 21 ++++++- .../gatk/walkers/annotator/FisherStrand.java | 57 ++++++++++++++++++- .../walkers/annotator/InbreedingCoeff.java | 12 +++- .../annotator/MappingQualityRankSumTest.java | 23 ++++++-- .../gatk/walkers/annotator/QualByDepth.java | 41 ++++++++++++- .../walkers/annotator/RMSMappingQuality.java | 36 +++++++++++- .../gatk/walkers/annotator/RankSumTest.java | 51 +++++++++++++++-- .../walkers/annotator/ReadPosRankSumTest.java | 27 +++++++-- .../annotator/VariantAnnotatorEngine.java | 29 ++++++++-- .../ActiveRegionBasedAnnotation.java | 2 +- 12 files changed, 309 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 97a4ac468..6eea12e2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -5,12 +5,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -31,8 +29,31 @@ public class BaseQualityRankSumTest extends RankSumTest { altQuals.add((double)p.getQual()); } } - } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? + return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + + if ( isUsableBase(p) ) { + if ( matchesRef ) + refQuals.add((double)p.getQual()); + else + altQuals.add((double)p.getQual()); + } + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 0acd3e841..b3a8dbebd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -49,7 +52,7 @@ import java.util.Map; * allele Frequency, for each ALT allele, in the same order as listed; total number * of alleles in called genotypes. */ -public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), @@ -63,6 +66,13 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( ! vc.hasGenotypes() ) + return null; + + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); + } + public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index b744fec46..f94d48893 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -33,7 +36,7 @@ import java.util.Map; * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with * -dcov D is N * D */ -public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -47,6 +50,22 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List alleleBin : alleleBins.values() ) { + depth += alleleBin.size(); + } + } + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", depth)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 817d6b1ff..0d3bd11a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,7 +51,7 @@ import java.util.*; * indicative of false positive calls. Note that the fisher strand test may not be * calculated for certain complex indel cases or for multi-allelic sites. */ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; @@ -78,6 +80,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( !vc.isVariant() ) + return null; + + int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + + Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); + if ( pvalue == null ) + return null; + + Map map = new HashMap(); + map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); + return map; + + } + public List getKeyNames() { return Arrays.asList(FS); } @@ -193,6 +211,38 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return sum; } + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + int[][] table = new int[2][2]; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alt.equals(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + boolean isFW = read.getReadNegativeStrandFlag(); + + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + } + + return table; + } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -214,8 +264,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat Allele base = Allele.create(p.getBase(), false); boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - boolean matchesRef = ref.equals(base, true); - boolean matchesAlt = alt.equals(base, true); + final boolean matchesRef = ref.equals(base, true); + final boolean matchesAlt = alt.equals(base, true); if ( matchesRef || matchesAlt ) { int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; @@ -227,6 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 6366890d5..57561a277 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -27,12 +30,19 @@ import java.util.Map; * more information. Note that the Inbreeding Coefficient will not be calculated for files * with fewer than a minimum (generally 10) number of samples. */ -public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation { +public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + + private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index aa4f26ef3..520b0f232 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -6,12 +6,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -35,6 +33,23 @@ public class MappingQualityRankSumTest extends RankSumTest { } } } + + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index bf60dec6b..24a107235 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -23,7 +26,7 @@ import java.util.Map; * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing * reads associated with the samples with polymorphic genotypes. */ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -62,4 +65,40 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int depth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); + if ( alleleBins == null ) + continue; + + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + if ( !alleleBin.getKey().equals(Allele.NO_CALL) ) + depth += alleleBin.getValue().size(); + } + } + + if ( depth == 0 ) + return null; + + double QD = -10.0 * vc.getLog10PError() / (double)depth; + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 50ade5334..97c15e747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -13,6 +14,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -24,7 +27,7 @@ import java.util.Map; /** * Root Mean Square of the mapping quality of the reads across all samples. */ -public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -34,7 +37,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( AlignmentContext context : stratifiedContexts.values() ) totalSize += context.size(); - int[] qualities = new int[totalSize]; + final int[] qualities = new int[totalSize]; int index = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -54,6 +57,35 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + depth += alleleBin.getValue().size(); + } + } + + final int[] qualities = new int[depth]; + int index = 0; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List reads : alleleBins.values() ) { + for ( final GATKSAMRecord read : reads ) { + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[index++] = read.getMappingQuality(); + } + } + } + + double rms = MathUtils.rms(qualities); + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", rms)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ff5f8f144..80d248ac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -26,7 +28,7 @@ import java.util.Map; /** * Abstract root for all RankSum based annotations */ -public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; @@ -38,7 +40,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (genotypes == null || genotypes.size() == 0) return null; - final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); @@ -104,12 +105,52 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (!Double.isNaN(testResults.first)) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; - } - protected abstract void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals); + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if (stratifiedContexts.size() == 0) + return null; - protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || genotypes.size() == 0) + return null; + + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final Map> context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + + fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), context, refQuals, altQuals); + } + + if ( refQuals.size() == 0 || altQuals.size() == 0 ) + return null; + + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + for (final Double qual : altQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + } + for (final Double qual : refQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + } + + // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + + final Map map = new HashMap(); + if (!Double.isNaN(testResults.first)) + map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + return map; + } + + protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals); + + protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); + + protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); protected static boolean isUsableBase(final PileupElement p) { return !(p.isInsertionAtBeginningOfRead() || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index a998cd08b..e013f0e08 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -11,12 +11,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). @@ -49,6 +47,27 @@ public class ReadPosRankSumTest extends RankSumTest { } } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? + return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 90d0ad740..413c32a24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -33,10 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -94,6 +92,13 @@ public class VariantAnnotatorEngine { initializeDBs(); } + // experimental constructor for active region traversal + public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) { + this.walker = null; + this.toolkit = toolkit; + requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.emptyList()); + } + // select specific expressions to use public void initializeExpressions(List expressionsToUse) { // set up the expressions @@ -169,7 +174,7 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -192,6 +197,20 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } + public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + + // go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); + } + + // generate a new annotated VC + return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + } + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index 2c1bb0974..de61c7741 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation -public interface ActiveRegionBasedAnnotation { +public interface ActiveRegionBasedAnnotation extends AnnotationType { // return annotations for the given contexts split by sample and then allele public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc);