From 67298f8a11f08d08b169b58d55fe0e45f8cdb32d Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 19 Dec 2011 23:14:26 -0500 Subject: [PATCH] AFCR made public (for use in VSS) Minor changes to ValidationSiteSelector logic (SampleSelectors determine whether a site is valid for output, no actual subset context need be operated on beyond that determination). Implementation of GL-based site selection. Minor changes to EJG. --- .../AlleleFrequencyCalculationResult.java | 10 ++++- .../FrequencyModeSelector.java | 6 +-- .../GLBasedSampleSelector.java | 43 ++++++++++++++++--- .../GTBasedSampleSelector.java | 10 +++-- .../KeepAFSpectrumFrequencySelector.java | 4 +- .../NullSampleSelector.java | 5 +-- .../SampleSelector.java | 2 +- .../UniformSamplingFrequencySelector.java | 4 +- .../ValidationSiteSelectorWalker.java | 17 +++++--- 9 files changed, 74 insertions(+), 27 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index ed80dce0d..9c4af8512 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -48,8 +48,16 @@ public class AlleleFrequencyCalculationResult { double log10LikelihoodOfAFzero = 0.0; double log10PosteriorOfAFzero = 0.0; - AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { + public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; } + + public double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } + + public double getLog10PosteriorOfAFzero() { + return log10PosteriorOfAFzero; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java index 5c0d6cb87..62305d3c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java @@ -38,10 +38,10 @@ public abstract class FrequencyModeSelector implements Cloneable{ protected FrequencyModeSelector(GenomeLocParser parser) { this.parser = parser; } - protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) { - logCurrentSiteData(vc, subVC, false, false); + protected void logCurrentSiteData(VariantContext vc, boolean passesCriteria) { + logCurrentSiteData(vc, passesCriteria, false, false); } - protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); + protected abstract void logCurrentSiteData(VariantContext vc, boolean included, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); protected abstract ArrayList selectValidationSites(int numValidationSites); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index d229e718e..ff3fe6506 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -23,21 +23,52 @@ */ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; +import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; +import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - public GLBasedSampleSelector(TreeSet sm) { + Map numAllelePriorMatrix = new HashMap(); + double referenceLikelihood; + public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); + referenceLikelihood = refLik; } - public VariantContext subsetSiteToSamples(VariantContext vc) { - /* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included. - For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */ - throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!"); - //return true; + public boolean selectSiteInSamples(VariantContext vc) { + if ( samples == null || samples.isEmpty() ) + return true; + // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) + // first subset to the samples + VariantContext subContext = vc.subContextFromSamples(samples); + + // now check to see (using EXACT model) whether this should be variant + // do we want to apply a prior? maybe user-spec? + double[][] flatPrior = createFlatPrior(vc.getAlleles()); + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + // do we want to let this qual go up or down? + if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { + return true; + } + + return false; + } + + private double[][] createFlatPrior(List alleles) { + if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { + numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); + } + + return numAllelePriorMatrix.get(alleles.size()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java index 44fca71fb..c3987b9db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -38,14 +38,18 @@ public class GTBasedSampleSelector extends SampleSelector{ super(sm); } - public VariantContext subsetSiteToSamples(VariantContext vc) { + public boolean selectSiteInSamples(VariantContext vc) { // Super class already defined initialization which filled data structure "samples" with desired samples. // We only need to check if current vc if polymorphic in that set of samples if ( samples == null || samples.isEmpty() ) - return vc; + return true; - return vc.subContextFromSamples(samples, vc.getAlleles()); + VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles()); + if ( subContext.isPolymorphicInSamples() ) { + return true; + } + return false; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java index 739cfcab4..15274d21c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java @@ -60,7 +60,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { postSampleSelectionHistogram = new int[NUM_BINS]; } - public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { // this method is called for every variant of a selected type, regardless of whether it will be selectable or not // get AC,AF,AN attributes from vc @@ -107,7 +107,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { numTotalSites++; // now process VC subsetted to samples of interest - if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC) return; //System.out.format("Post:%4.4f %d\n",af0, binIndex); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java index 35b8893c2..a48bcb8a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java @@ -33,8 +33,7 @@ public class NullSampleSelector extends SampleSelector{ super(sm); } - public VariantContext subsetSiteToSamples(VariantContext vc) { - return vc; - + public boolean selectSiteInSamples(VariantContext vc) { + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java index e75fcef5d..afbff93d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java @@ -34,7 +34,7 @@ public abstract class SampleSelector implements Cloneable { samples = new TreeSet(sm); } - protected abstract VariantContext subsetSiteToSamples(VariantContext vc); + protected abstract boolean selectSiteInSamples(VariantContext vc); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java index 506a6b2e4..66720a252 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java @@ -42,14 +42,14 @@ public class UniformSamplingFrequencySelector extends FrequencyModeSelector { } - public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { HashMap attributes = new HashMap(); if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { // recompute AF,AC,AN based on genotypes: VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); - if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC) return; } else { if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java index 10d2f639c..ae11d8102 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -124,6 +124,10 @@ public class ValidationSiteSelectorWalker extends RodWalker { @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false) private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE; + @Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" + + " that a site is non-reference in the samples for which to include the site",required=false) + private double samplePNonref = 0.99; + @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true) private int numValidationSites; @@ -231,13 +235,14 @@ public class ValidationSiteSelectorWalker extends RodWalker { continue; - // do anything required by frequency selector before we select for samples - VariantContext subVC; + // does this site pass the criteria for the samples we are interested in? + boolean passesSampleSelectionCriteria; if (samples.isEmpty()) - subVC = vc; + passesSampleSelectionCriteria = true; else - subVC = sampleSelector.subsetSiteToSamples(vc); - frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC); + passesSampleSelectionCriteria = sampleSelector.selectSiteInSamples(vc); + + frequencyModeSelector.logCurrentSiteData(vc,passesSampleSelectionCriteria,IGNORE_GENOTYPES,IGNORE_POLYMORPHIC); } return 1; } @@ -263,7 +268,7 @@ public class ValidationSiteSelectorWalker extends RodWalker { SampleSelector sm; switch ( sampleMode ) { case POLY_BASED_ON_GL: - sm = new GLBasedSampleSelector(samples); + sm = new GLBasedSampleSelector(samples, Math.log10(1.0-samplePNonref)); break; case POLY_BASED_ON_GT: sm = new GTBasedSampleSelector(samples);