AFCR made public (for use in VSS)
Minor changes to ValidationSiteSelector logic (SampleSelectors determine whether a site is valid for output, no actual subset context need be operated on beyond that determination). Implementation of GL-based site selection. Minor changes to EJG.
This commit is contained in:
parent
339ef92eac
commit
67298f8a11
|
|
@ -48,8 +48,16 @@ public class AlleleFrequencyCalculationResult {
|
|||
double log10LikelihoodOfAFzero = 0.0;
|
||||
double log10PosteriorOfAFzero = 0.0;
|
||||
|
||||
AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
|
||||
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
|
||||
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
|
||||
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
}
|
||||
|
|
@ -38,10 +38,10 @@ public abstract class FrequencyModeSelector implements Cloneable{
|
|||
protected FrequencyModeSelector(GenomeLocParser parser) {
|
||||
this.parser = parser;
|
||||
}
|
||||
protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) {
|
||||
logCurrentSiteData(vc, subVC, false, false);
|
||||
protected void logCurrentSiteData(VariantContext vc, boolean passesCriteria) {
|
||||
logCurrentSiteData(vc, passesCriteria, false, false);
|
||||
}
|
||||
protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC);
|
||||
protected abstract void logCurrentSiteData(VariantContext vc, boolean included, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC);
|
||||
protected abstract ArrayList<VariantContext> selectValidationSites(int numValidationSites);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,21 +23,52 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
public class GLBasedSampleSelector extends SampleSelector {
|
||||
public GLBasedSampleSelector(TreeSet<String> sm) {
|
||||
Map<Integer,double[][]> numAllelePriorMatrix = new HashMap<Integer,double[][]>();
|
||||
double referenceLikelihood;
|
||||
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
|
||||
super(sm);
|
||||
referenceLikelihood = refLik;
|
||||
}
|
||||
|
||||
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||
/* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included.
|
||||
For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */
|
||||
throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!");
|
||||
//return true;
|
||||
public boolean selectSiteInSamples(VariantContext vc) {
|
||||
if ( samples == null || samples.isEmpty() )
|
||||
return true;
|
||||
// want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
|
||||
// first subset to the samples
|
||||
VariantContext subContext = vc.subContextFromSamples(samples);
|
||||
|
||||
// now check to see (using EXACT model) whether this should be variant
|
||||
// do we want to apply a prior? maybe user-spec?
|
||||
double[][] flatPrior = createFlatPrior(vc.getAlleles());
|
||||
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size());
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true);
|
||||
// do we want to let this qual go up or down?
|
||||
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private double[][] createFlatPrior(List<Allele> alleles) {
|
||||
if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) {
|
||||
numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]);
|
||||
}
|
||||
|
||||
return numAllelePriorMatrix.get(alleles.size());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,14 +38,18 @@ public class GTBasedSampleSelector extends SampleSelector{
|
|||
super(sm);
|
||||
}
|
||||
|
||||
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||
public boolean selectSiteInSamples(VariantContext vc) {
|
||||
// Super class already defined initialization which filled data structure "samples" with desired samples.
|
||||
// We only need to check if current vc if polymorphic in that set of samples
|
||||
|
||||
if ( samples == null || samples.isEmpty() )
|
||||
return vc;
|
||||
return true;
|
||||
|
||||
return vc.subContextFromSamples(samples, vc.getAlleles());
|
||||
VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles());
|
||||
if ( subContext.isPolymorphicInSamples() ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector {
|
|||
postSampleSelectionHistogram = new int[NUM_BINS];
|
||||
}
|
||||
|
||||
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||
public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||
|
||||
// this method is called for every variant of a selected type, regardless of whether it will be selectable or not
|
||||
// get AC,AF,AN attributes from vc
|
||||
|
|
@ -107,7 +107,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector {
|
|||
numTotalSites++;
|
||||
|
||||
// now process VC subsetted to samples of interest
|
||||
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
|
||||
if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC)
|
||||
return;
|
||||
|
||||
//System.out.format("Post:%4.4f %d\n",af0, binIndex);
|
||||
|
|
|
|||
|
|
@ -33,8 +33,7 @@ public class NullSampleSelector extends SampleSelector{
|
|||
super(sm);
|
||||
}
|
||||
|
||||
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||
return vc;
|
||||
|
||||
public boolean selectSiteInSamples(VariantContext vc) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ public abstract class SampleSelector implements Cloneable {
|
|||
samples = new TreeSet<String>(sm);
|
||||
}
|
||||
|
||||
protected abstract VariantContext subsetSiteToSamples(VariantContext vc);
|
||||
protected abstract boolean selectSiteInSamples(VariantContext vc);
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,14 +42,14 @@ public class UniformSamplingFrequencySelector extends FrequencyModeSelector {
|
|||
|
||||
}
|
||||
|
||||
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||
public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
|
||||
if (vc.hasGenotypes() && !IGNORE_GENOTYPES) {
|
||||
// recompute AF,AC,AN based on genotypes:
|
||||
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
|
||||
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
|
||||
if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC)
|
||||
return;
|
||||
} else {
|
||||
if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||
|
|
|
|||
|
|
@ -124,6 +124,10 @@ public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false)
|
||||
private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE;
|
||||
|
||||
@Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" +
|
||||
" that a site is non-reference in the samples for which to include the site",required=false)
|
||||
private double samplePNonref = 0.99;
|
||||
|
||||
@Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true)
|
||||
private int numValidationSites;
|
||||
|
||||
|
|
@ -231,13 +235,14 @@ public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
|
|||
continue;
|
||||
|
||||
|
||||
// do anything required by frequency selector before we select for samples
|
||||
VariantContext subVC;
|
||||
// does this site pass the criteria for the samples we are interested in?
|
||||
boolean passesSampleSelectionCriteria;
|
||||
if (samples.isEmpty())
|
||||
subVC = vc;
|
||||
passesSampleSelectionCriteria = true;
|
||||
else
|
||||
subVC = sampleSelector.subsetSiteToSamples(vc);
|
||||
frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC);
|
||||
passesSampleSelectionCriteria = sampleSelector.selectSiteInSamples(vc);
|
||||
|
||||
frequencyModeSelector.logCurrentSiteData(vc,passesSampleSelectionCriteria,IGNORE_GENOTYPES,IGNORE_POLYMORPHIC);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -263,7 +268,7 @@ public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
|
|||
SampleSelector sm;
|
||||
switch ( sampleMode ) {
|
||||
case POLY_BASED_ON_GL:
|
||||
sm = new GLBasedSampleSelector(samples);
|
||||
sm = new GLBasedSampleSelector(samples, Math.log10(1.0-samplePNonref));
|
||||
break;
|
||||
case POLY_BASED_ON_GT:
|
||||
sm = new GTBasedSampleSelector(samples);
|
||||
|
|
|
|||
Loading…
Reference in New Issue