Large refactoring of the UGv2 engine so that it is now truly separated into 2 distinct phases: GL calculation and AF calculation, where each can be done independently. This is not yet enabled in UGv2 itself though because I need to work out one last issue or two. Tested on 1Mb of 1000G Aug allPops low-pass and results are identical to before. Also, making BQ capping by MQ mandatory.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4744 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-11-28 21:36:33 +00:00
parent ce051e4e9a
commit 102c8b1f59
10 changed files with 231 additions and 222 deletions

View File

@ -26,11 +26,9 @@
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broad.tribble.util.variantcontext.Genotype;
import java.io.PrintStream;
@ -57,17 +55,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
protected class CalculatedAlleleFrequency {
public double log10PNonRef;
public int altAlleles;
public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
this.log10PNonRef = log10PNonRef;
this.altAlleles = altAlleles;
}
}
protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
this.N = N;
this.logger = logger;
@ -84,31 +71,19 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
*/
protected abstract void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors);
/**
* Can be overridden by concrete subclasses
* @param contexts alignment contexts
* @param GLs genotype likelihoods
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
protected abstract Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
protected abstract Map<String, Genotype> assignGenotypes(VariantContext vc,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
count++;
}
return count;
}
}

View File

@ -31,8 +31,8 @@ public class BiallelicGenotypeLikelihoods {
private String sample;
private double[] GLs;
private double[] GPs;
private Allele A, B;
private int depth;
/**
* Create a new object for sample with given alleles and genotype likelihoods
@ -43,32 +43,7 @@ public class BiallelicGenotypeLikelihoods {
* @param log10AALikelihoods AA likelihoods
* @param log10ABLikelihoods AB likelihoods
* @param log10BBLikelihoods BB likelihoods
*/
public BiallelicGenotypeLikelihoods(String sample,
Allele A,
Allele B,
double log10AALikelihoods,
double log10ABLikelihoods,
double log10BBLikelihoods) {
this.sample = sample;
this.A = A;
this.B = B;
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
this.GPs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
}
/**
* Create a new object for sample with given alleles and genotype likelihoods & posteriors
*
* @param sample sample name
* @param A allele A
* @param B allele B
* @param log10AALikelihoods AA likelihoods
* @param log10ABLikelihoods AB likelihoods
* @param log10BBLikelihoods BB likelihoods
* @param log10AAPosteriors AA posteriors
* @param log10ABPosteriors AB posteriors
* @param log10BBPosteriors BB posteriors
* @param depth the read depth used in creating the likelihoods
*/
public BiallelicGenotypeLikelihoods(String sample,
Allele A,
@ -76,14 +51,12 @@ public class BiallelicGenotypeLikelihoods {
double log10AALikelihoods,
double log10ABLikelihoods,
double log10BBLikelihoods,
double log10AAPosteriors,
double log10ABPosteriors,
double log10BBPosteriors) {
int depth) {
this.sample = sample;
this.A = A;
this.B = B;
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
this.GPs = new double[]{log10AAPosteriors, log10ABPosteriors, log10BBPosteriors};
this.depth = depth;
}
public String getSample() {
@ -106,22 +79,6 @@ public class BiallelicGenotypeLikelihoods {
return GLs;
}
public double getAAPosteriors() {
return GPs[0];
}
public double getABPosteriors() {
return GPs[1];
}
public double getBBPosteriors() {
return GPs[2];
}
public double[] getPosteriors() {
return GPs;
}
public Allele getAlleleA() {
return A;
}
@ -129,4 +86,9 @@ public class BiallelicGenotypeLikelihoods {
public Allele getAlleleB() {
return B;
}
public int getDepth() {
return depth;
}
}

View File

@ -29,11 +29,9 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.Haplotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -146,11 +144,14 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
if (Double.isInfinite(genotypeLikelihoods[k]))
genotypeLikelihoods[k] = MINUS_LOG_INFINITY;
}
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),vc.getReference(),
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
vc.getReference(),
vc.getAlternateAllele(0),
genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]));
genotypeLikelihoods[0],
genotypeLikelihoods[1],
genotypeLikelihoods[2],
getFilteredDepth(pileup)));
//System.out.format("%4.2f %4.2f %4.2f\n",genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
}
}

View File

@ -29,12 +29,12 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
import java.io.PrintStream;
@ -73,7 +73,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
@ -108,11 +108,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
*/
for ( String sample : GLs.keySet() ) {
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
if ( !sample.getValue().hasLikelihoods() )
continue;
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = GLs.get(sample).getLikelihoods();
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
@ -120,8 +123,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
int k = 1;
for (k=1; k <= 2*j; k++ ) {
for (int k=1; k <= 2*j; k++ ) {
// if (k > 3 && isClearRefSite)
// break;
@ -211,20 +213,22 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
/**
* Can be overridden by concrete subclasses
* @param contexts alignment contexts
* @param GLs genotype likelihoods
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
public Map<String, Genotype> assignGenotypes(VariantContext vc,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood) {
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
Allele refAllele = vc.getReference();
Allele altAllele = vc.getAlternateAllele(0);
Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
@ -244,10 +248,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
else {
for (String sample: GLs.keySet()) {
sampleIndices.add(sample);
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
if ( !sample.getValue().hasLikelihoods() )
continue;
double[] likelihoods = GLs.get(sample).getLikelihoods();
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
sampleIndices.add(sample.getKey());
for (int k=0; k <= AFofMaxLikelihood; k++) {
@ -277,61 +283,56 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
int startIdx = AFofMaxLikelihood;
for (int k = sampleIdx; k > 0; k--) {
int bestGTguess;
String sample = sampleIndices.get(k-1);
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
Allele alleleA = GL.getAlleleA();
Allele alleleB = GL.getAlleleB();
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
if (SIMPLE_GREEDY_GENOTYPER)
bestGTguess = Utils.findIndexOfMaxEntry(GLs.get(sample).getLikelihoods());
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
if (context.hasBasePileup())
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getBasePileup()));
else if (context.hasExtendedEventPileup())
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(context.getExtendedEventPileup()));
double qual;
double[] posteriors = GLs.get(sample).getPosteriors();
double[] likelihoods = g.getLikelihoods().getAsVector();
if (bestGTguess == 0) {
myAlleles.add(alleleA);
myAlleles.add(alleleA);
qual = posteriors[0] - Math.max(posteriors[1],posteriors[2]);
myAlleles.add(refAllele);
myAlleles.add(refAllele);
qual = likelihoods[0] - Math.max(likelihoods[1], likelihoods[2]);
} else if(bestGTguess == 1) {
myAlleles.add(alleleA);
myAlleles.add(alleleB);
qual = posteriors[1] - Math.max(posteriors[0],posteriors[2]);
myAlleles.add(refAllele);
myAlleles.add(altAllele);
qual = likelihoods[1] - Math.max(likelihoods[0], likelihoods[2]);
} else {
myAlleles.add(alleleB);
myAlleles.add(alleleB);
qual = posteriors[2] - Math.max(posteriors[1],posteriors[0]);
myAlleles.add(altAllele);
myAlleles.add(altAllele);
qual = likelihoods[2] - Math.max(likelihoods[1], likelihoods[0]);
}
if (qual < 0) {
// QUAL can be negative if the chosen genotype is not the most likely one individually.
// In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
double[] normalized = MathUtils.normalizeFromLog10(posteriors);
double[] normalized = MathUtils.normalizeFromLog10(likelihoods);
double chosenGenotype = normalized[bestGTguess];
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
}
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), UnifiedGenotyperV2.DEFAULT_GENOTYPE_LIKELIHOODS_KEY);
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
HashMap<String, Object> attributes = new HashMap<String, Object>(g.getAttributes());
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(likelihoods));
calls.put(sample, new Genotype(sample, myAlleles, qual, null, attributes, false));

View File

@ -30,6 +30,8 @@ import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.Map;
@ -74,4 +76,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
StratifiedAlignmentContext.StratifiedContextType contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for ( PileupElement p : pileup ) {
if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
count++;
}
return count;
}
}

View File

@ -29,13 +29,14 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
import java.io.PrintStream;
@ -55,7 +56,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
protected void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
initializeAFMatrix(GLs);
@ -86,43 +87,43 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
/**
* Overrides the super class
* @param contexts alignment contexts
* @param GLs genotype likelihoods
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
protected Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
protected Map<String, Genotype> assignGenotypes(VariantContext vc,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood) {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
Allele refAllele = vc.getReference();
Allele altAllele = vc.getAlternateAllele(0);
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
// first, the potential alt calls
for ( String sample : AFMatrix.getSamples() ) {
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
Allele alleleA = GL.getAlleleA();
Allele alleleB = GL.getAlleleB();
Genotype g = vc.getGenotype(sample);
// set the genotype and confidence
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
myAlleles.add(alleleA);
myAlleles.add(alleleA);
myAlleles.add(refAllele);
myAlleles.add(refAllele);
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
myAlleles.add(alleleA);
myAlleles.add(alleleB);
myAlleles.add(refAllele);
myAlleles.add(altAllele);
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
myAlleles.add(alleleB);
myAlleles.add(alleleB);
myAlleles.add(altAllele);
myAlleles.add(altAllele);
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), UnifiedGenotyperV2.DEFAULT_GENOTYPE_LIKELIHOODS_KEY);
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
HashMap<String, Object> attributes = new HashMap<String, Object>(g.getAttributes());
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(g.getLikelihoods().getAsVector()));
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
}
@ -130,11 +131,13 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
return calls;
}
private void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
private void initializeAFMatrix(Map<String, Genotype> GLs) {
AFMatrix.clear();
for ( BiallelicGenotypeLikelihoods GL : GLs.values() )
AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
for ( Genotype g : GLs.values() ) {
if ( g.hasLikelihoods() )
AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName());
}
}
protected static class AlleleFrequencyMatrix {
@ -165,14 +168,6 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
samplesToGenotypesPerAF.clear();
}
public void setLikelihoods(double AA, double AB, double BB, String sample) {
int index = samples.size();
samples.add(sample);
matrix[index][GenotypeType.AA.ordinal()] = AA;
matrix[index][GenotypeType.AB.ordinal()] = AB;
matrix[index][GenotypeType.BB.ordinal()] = BB;
}
public void setLikelihoods(double[] GLs, String sample) {
int index = samples.size();
samples.add(sample);

View File

@ -85,12 +85,11 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// create the GenotypeLikelihoods object
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform);
int nGoodBases = GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
int nGoodBases = GL.add(pileup, true, true);
if ( nGoodBases == 0 )
continue;
double[] likelihoods = GL.getLikelihoods();
double[] posteriors = GL.getPosteriors();
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
@ -101,9 +100,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
likelihoods[refGenotype.ordinal()],
likelihoods[hetGenotype.ordinal()],
likelihoods[homGenotype.ordinal()],
posteriors[refGenotype.ordinal()],
posteriors[hetGenotype.ordinal()],
posteriors[homGenotype.ordinal()]));
getFilteredDepth(pileup)));
}
return refAllele;

View File

@ -93,9 +93,6 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
public boolean CAP_BASE_QUALITY = false;
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
@ -117,7 +114,6 @@ public class UnifiedArgumentCollection {
uac.MAX_MISMATCHES = MAX_MISMATCHES;
uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
uac.CAP_BASE_QUALITY = CAP_BASE_QUALITY;
return uac;
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
@ -133,37 +134,115 @@ public class UnifiedGenotyperEngine {
}
/**
* Compute at a given locus.
* Compute full calls at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
return vc == null ? null : calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
}
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
/**
* Compute GLs at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantContext object
*/
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
return calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
}
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) {
if ( stratifiedContexts == null )
return null;
// initialize the data for this thread if that hasn't been done yet
if ( glcm.get() == null ) {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs);
return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
}
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
// no-call everyone for now
List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
Set<Allele> alleles = new HashSet<Allele>();
alleles.add(refAllele);
boolean addedAltAllele = false;
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( !addedAltAllele ) {
addedAltAllele = true;
alleles.add(GL.getAlleleA());
alleles.add(GL.getAlleleB());
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
}
GenomeLoc loc = refContext.getLocus();
int endLoc = calculateEndPos(alleles, refAllele, loc);
return new VariantContext("UG_call",
loc.getContig(),
loc.getStart(),
endLoc,
alleles,
genotypes,
VariantContext.NO_NEG_LOG_10PERROR,
null,
null);
}
/**
* Compute genotypes at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param vc the GL-annotated variant context
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
}
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter);
if ( stratifiedContexts == null )
return null;
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, genotypePriors, GLs);
// estimate our confidence in a reference call and return
if ( GLs.size() == 0 )
if ( vc.getNSamples() == 0 )
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
@ -204,17 +283,11 @@ public class UnifiedGenotyperEngine {
}
// create the genotypes
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(stratifiedContexts, GLs, log10AlleleFrequencyPosteriors.get(), bestAFguess);
// next, get the variant context data (alleles, attributes, etc.)
HashSet<Allele> alleles = new HashSet<Allele>();
alleles.add(refAllele);
for ( Genotype g : genotypes.values() )
alleles.addAll(g.getAlleles());
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
// print out stats if we have a writer
if ( verboseWriter != null )
printVerboseData(refContext.getLocus().toString(), alleles, PofF, phredScaledConfidence, normalizedPosteriors);
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -237,20 +310,18 @@ public class UnifiedGenotyperEngine {
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
GLs.clear();
glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD, genotypePriors, GLs);
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
GLs.clear();
glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE, genotypePriors, GLs);
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
@ -271,26 +342,10 @@ public class UnifiedGenotyperEngine {
GenomeLoc loc = refContext.getLocus();
// todo - temp fix until we can deal with extended events properly
long endLoc;
// for indels, stop location is one more than ref allele length
boolean isSNP = true;
for (Allele a:alleles){
if (a.getBaseString().length() != 1) {
isSNP = false;
break;
}
}
if (isSNP)
endLoc = loc.getStart();
else
endLoc = loc.getStart() + refAllele.length();
//VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
if ( annotationEngine != null ) {
// first off, we want to use the *unfiltered* context for the annotations
@ -299,23 +354,42 @@ public class UnifiedGenotyperEngine {
pileup = rawContext.getExtendedEventPileup();
else if (rawContext.hasBasePileup())
pileup = rawContext.getBasePileup();
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc);
vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
}
VariantCallContext call = new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
call.setRefBase(refContext.getBase());
return call;
}
private int calculateEndPos(Set<Allele> alleles, Allele refAllele, GenomeLoc loc) {
    // TODO - temp fix until we can deal with extended events properly
    // For SNPs the call ends at its start position; for indels the stop
    // position is one more than the reference allele length.
    for ( Allele allele : alleles ) {
        if ( allele.getBaseString().length() != 1 ) {
            // any allele longer than a single base means this is an indel
            return loc.getStart() + refAllele.length();
        }
    }
    // every allele is a single base => SNP
    return loc.getStart();
}
private static boolean isValidDeletionFraction(double d) {
    // a deletion fraction is a probability, so it must lie within [0.0, 1.0];
    // note that NaN fails both comparisons and is therefore rejected
    return 0.0 <= d && d <= 1.0;
}
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadBaseFilter badBaseFilter) {
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) {
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
@ -342,7 +416,7 @@ public class UnifiedGenotyperEngine {
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
// filter the reads (and test for bad pileups)
if ( !filterPileup(stratifiedContexts, badBaseFilter) )
if ( !filterPileup(stratifiedContexts, badReadPileupFilter) )
return null;
}
@ -382,9 +456,9 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
protected void printVerboseData(String pos, Set<Allele> alleles, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
Allele refAllele = null, altAllele = null;
for ( Allele allele : alleles ) {
for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() )
refAllele = allele;
else

View File

@ -49,7 +49,6 @@ import java.io.PrintStream;
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
public static final String DEFAULT_GENOTYPE_LIKELIHOODS_KEY = VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY;
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@ -58,10 +57,6 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
@Deprecated
protected String varout;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@ -158,7 +153,8 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(UnifiedGenotyperV2.DEFAULT_GENOTYPE_LIKELIHOODS_KEY));
// TODO: if only outputting GLs, change this to the GL version
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
@ -177,7 +173,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
* @return the VariantCallContext object
*/
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return UG_engine.runGenotyper(tracker, refContext, rawContext);
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
}
public UGStatistics reduceInit() { return new UGStatistics(); }