UG map() now returns a VariantCallContext object. Also has a field for confidentlyCalledBases. UG reduce() emits statistics on the confident called % of bases

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2664 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-01-22 23:06:43 +00:00
parent fbf82526cb
commit c871a0f221
12 changed files with 137 additions and 71 deletions

View File

@ -19,7 +19,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
protected EMGenotypeCalculationModel() {}
public Pair<VariationCall, List<Genotype>> calculateGenotype(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
public VariantCallContext callLocus(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
// run the EM calculation
EMOutput overall = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
@ -40,7 +40,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
return new VariantCallContext(phredScaledConfidence >= CONFIDENCE_THRESHOLD);
// generate the calls
List<Genotype> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);
@ -81,7 +81,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
for ( Genotype call : calls )
((GenotypeCall)call).setVariation(locusdata);
}
return new Pair<VariationCall, List<Genotype>>(locusdata, calls);
return new VariantCallContext(locusdata, calls, phredScaledConfidence >= CONFIDENCE_THRESHOLD);
}
protected List<Genotype> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, Map<String, StratifiedAlignmentContext> contexts) {

View File

@ -85,11 +85,11 @@ public abstract class GenotypeCalculationModel implements Cloneable {
*
* @return calls
*/
public abstract Pair<VariationCall, List<Genotype>> calculateGenotype(RefMetaDataTracker tracker,
char ref,
GenomeLoc loc,
Map<String, StratifiedAlignmentContext> stratifiedContexts,
DiploidGenotypePriors priors);
public abstract VariantCallContext callLocus(RefMetaDataTracker tracker,
char ref,
GenomeLoc loc,
Map<String, StratifiedAlignmentContext> stratifiedContexts,
DiploidGenotypePriors priors);
/**
* @param tracker rod data
@ -99,15 +99,4 @@ public abstract class GenotypeCalculationModel implements Cloneable {
public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
}
/**
* Determine whether we're at a Hapmap site
*
* @param tracker the meta data tracker
*
* @return true if we're at a Hapmap site, false if otherwise
*/
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
return tracker.getTrackData("hapmap", null) != null;
}
}

View File

@ -39,8 +39,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
protected JointEstimateGenotypeCalculationModel() {}
public Pair<VariationCall, List<Genotype>> calculateGenotype(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
public VariantCallContext callLocus(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
int numSamples = getNSamples(contexts);
int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero)
@ -50,8 +49,11 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// if there are no non-ref bases...
if ( bestAlternateAllele == null ) {
// if we don't want all bases, then we can just return
if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
return new Pair<VariationCall, List<Genotype>>(null, null);
// todo -- we still need to calculate the confidence in the reference base.
// todo -- we can still include this optimization, but we should calculate a confidence score
// if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
// return new VariantCallContext(false);
// otherwise, choose any alternate allele (it doesn't really matter)
bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
@ -282,7 +284,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
return new ArrayList<Genotype>();
}
protected Pair<VariationCall, List<Genotype>> createCalls(RefMetaDataTracker tracker, char ref, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
protected VariantCallContext createCalls(RefMetaDataTracker tracker, char ref, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
// only need to look at the most likely alternate allele
int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);
@ -304,7 +306,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestAFguess == 0) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
return new VariantCallContext(phredScaledConfidence >= CONFIDENCE_THRESHOLD);
// output to beagle file if requested
if ( beagleWriter != null ) {
@ -380,6 +382,6 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
((GenotypeCall)call).setVariation(locusdata);
}
return new Pair<VariationCall, List<Genotype>>(locusdata, calls);
return new VariantCallContext(locusdata, calls, phredScaledConfidence >= CONFIDENCE_THRESHOLD);
}
}

View File

@ -26,7 +26,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// overload this method so we can special-case the single sample
public Pair<VariationCall, List<Genotype>> calculateGenotype(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
public VariantCallContext callLocus(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
// we don't actually want to run EM for single samples
if ( samples.size() == 1 ) {
@ -63,7 +63,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
// are we above the lod threshold for emitting calls (and not in all-bases mode)?
if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
return new Pair<VariationCall, List<Genotype>>(null, null);
return new VariantCallContext(phredScaledConfidence >= CONFIDENCE_THRESHOLD);
// we can now create the genotype call object
GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, loc);
@ -104,10 +104,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
call.setVariation(locusdata);
return new Pair<VariationCall, List<Genotype>>(locusdata, Arrays.asList((Genotype)call));
return new VariantCallContext(locusdata, Arrays.asList((Genotype)call), phredScaledConfidence >= CONFIDENCE_THRESHOLD);
}
return super.calculateGenotype(tracker, ref, loc, contexts, priors);
return super.callLocus(tracker, ref, loc, contexts, priors);
}
private Pair<ReadBackedPileup, GenotypeLikelihoods> getSingleSampleLikelihoods(StratifiedAlignmentContext sampleContext, DiploidGenotypePriors priors, StratifiedAlignmentContext.StratifiedContextType contextType) {

View File

@ -47,8 +47,7 @@ import java.io.PrintStream;
* multi-sample, and pooled data. The user can choose from several different incorporated calculation models.
*/
@Reference(window=@Window(start=-20,stop=20))
public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genotype>>, Integer> implements TreeReducible<Integer> {
public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@ -67,10 +66,27 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// samples in input
private Set<String> samples = new HashSet<String>();
/** Enable deletions in the pileup **/
public boolean includeReadsWithDeletionAtLoci() { return true; }
/**
* Inner class for collecting output statistics from the UG
*/
public class UGStatistics {
/** The total number of passes examined -- i.e., the number of map calls */
long nBasesVisited = 0;
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
long nBasesCallable = 0;
/** The number of bases called confidently (according to user threshold), either ref or other */
long nBasesCalledConfidently = 0;
double percentCallableOfAll() { return (100.0 * nBasesCallable) / nBasesVisited; }
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / nBasesVisited; }
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / nBasesCallable; }
}
/**
* Sets the argument collection for the UnifiedGenotyper.
* To be used with walkers that call the UnifiedGenotyper's map function
@ -204,7 +220,7 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
* @param refContext the reference base
* @param rawContext contextual information around the locus
*/
public Pair<VariationCall, List<Genotype>> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
if ( gcm.get() == null ) {
@ -249,19 +265,19 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
return null;
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
Pair<VariationCall, List<Genotype>> call = gcm.get().calculateGenotype(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
VariantCallContext call = gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
// annotate the call, if possible
if ( call != null && call.first != null && call.first instanceof ArbitraryFieldsBacked ) {
if ( call != null && call.variation != null && call.variation instanceof ArbitraryFieldsBacked ) {
// first off, we want to use the *unfiltered* context for the annotations
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
Map<String, String> annotations;
if ( UAC.ALL_ANNOTATIONS )
annotations = VariantAnnotator.getAllAnnotations(tracker, refContext, stratifiedContexts, call.first);
annotations = VariantAnnotator.getAllAnnotations(tracker, refContext, stratifiedContexts, call.variation);
else
annotations = VariantAnnotator.getAnnotations(tracker, refContext, stratifiedContexts, call.first);
((ArbitraryFieldsBacked)call.first).setFields(annotations);
annotations = VariantAnnotator.getAnnotations(tracker, refContext, stratifiedContexts, call.variation);
((ArbitraryFieldsBacked)call.variation).setFields(annotations);
}
return call;
@ -284,38 +300,61 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
return ( d >= 0.0 && d <= 1.0 );
}
public Integer reduceInit() { return 0; }
// ------------------------------------------------------------------------------------------------
//
// Reduce
//
// ------------------------------------------------------------------------------------------------
public UGStatistics reduceInit() { return new UGStatistics(); }
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
lhs.nBasesCallable += rhs.nBasesCallable;
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
lhs.nBasesVisited += rhs.nBasesVisited;
return lhs;
}
public Integer reduce(Pair<VariationCall, List<Genotype>> value, Integer sum) {
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
// We get a point for reaching reduce :-)
sum.nBasesVisited++;
// can't call the locus because of no coverage
if ( value == null )
return sum;
// A call was attempted -- the base was potentially callable
sum.nBasesCallable++;
// if the base was confidently called something, print it out
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
// can't make a confident variant call here
if ( value.second == null ||
(UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED && value.second.size() == 0) ) {
if ( value.genotypes == null ||
(UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED && value.genotypes.size() == 0) ) {
return sum;
}
// if we have a single-sample call (single sample from PointEstimate model returns no VariationCall data)
if ( value.first == null || (!writer.supportsMultiSample() && samples.size() <= 1) ) {
writer.addGenotypeCall(value.second.get(0));
if ( value.variation == null || (!writer.supportsMultiSample() && samples.size() <= 1) ) {
writer.addGenotypeCall(value.genotypes.get(0));
}
// use multi-sample mode if we have multiple samples or the output type allows it
else {
writer.addMultiSampleCall(value.second, value.first);
writer.addMultiSampleCall(value.genotypes, value.variation);
}
return sum + 1;
return sum;
}
// Close any file writers
public void onTraversalDone(Integer sum) {
logger.info("Processed " + sum + " loci that are callable for SNPs");
public void onTraversalDone(UGStatistics sum) {
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
// logger.info("Processed " + sum.nBasesCallable + " loci that are callable for SNPs");
}
}

View File

@ -0,0 +1,32 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.genotype.VariationCall;
import org.broadinstitute.sting.utils.genotype.Genotype;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jan 22, 2010
* Time: 2:25:19 PM
*
* Useful helper class to communicate the results of calculateGenotype to framework
*/
public class VariantCallContext {
public VariationCall variation = null;
public List<Genotype> genotypes = null;
/** Was the site called confidently, either reference or variant? */
public boolean confidentlyCalled = false;
VariantCallContext(VariationCall variation, List<Genotype> genotypes, boolean confidentlyCalledP) {
this.variation = variation;
this.genotypes = genotypes;
this.confidentlyCalled = confidentlyCalledP;
}
/** blank variation and genotypes => we're a ref site */
VariantCallContext(boolean confidentlyCalledP) {
this.confidentlyCalled = confidentlyCalledP;
}
}

View File

@ -361,10 +361,10 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if ( !BaseUtils.isRegularBase(ref.getBase()) )
return false;
Pair<VariationCall, List<Genotype>> calls = ug.map(tracker,ref,context);
if ( calls == null || calls.second == null)
VariantCallContext calls = ug.map(tracker,ref,context);
if ( calls == null || calls.genotypes == null)
return false;
return ( calls.second.size() > 0 && !calls.second.get(0).isVariant(ref.getBase()) && calls.second.get(0).getNegLog10PError() > confidentRefThreshold );
return ( calls.genotypes.size() > 0 && !calls.genotypes.get(0).isVariant(ref.getBase()) && calls.genotypes.get(0).getNegLog10PError() > confidentRefThreshold );
}

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariationRod;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.Pair;
@ -73,14 +74,14 @@ public class DeNovoSNPWalker extends RefWalker<String, Integer>{
}
AlignmentContext parent1_subContext = new AlignmentContext(context.getLocation(), parent1_reads, parent1_offsets);
Pair<VariationCall, List<Genotype>> parent1 = UG.map(tracker, ref, parent1_subContext);
VariantCallContext parent1 = UG.map(tracker, ref, parent1_subContext);
AlignmentContext parent2_subContext = new AlignmentContext(context.getLocation(), parent2_reads, parent2_offsets);
Pair<VariationCall, List<Genotype>> parent2 = UG.map(tracker, ref, parent2_subContext);
VariantCallContext parent2 = UG.map(tracker, ref, parent2_subContext);
if ( parent1 != null && parent1.second != null && parent2 != null && parent2.second != null ) {
Genotype parent1call = parent1.second.get(0);
Genotype parent2call = parent2.second.get(0);
if ( parent1 != null && parent1.genotypes != null && parent2 != null && parent2.genotypes != null ) {
Genotype parent1call = parent1.genotypes.get(0);
Genotype parent2call = parent2.genotypes.get(0);
if (!parent1call.isVariant(parent1call.getReference()) &&
parent1call.getNegLog10PError() > 5 &&

View File

@ -165,11 +165,11 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
}
private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
Pair<VariationCall, List<Genotype>> calls = ug.map(tracker,ref,context);
if ( calls == null || calls.first == null || calls.second == null )
VariantCallContext calls = ug.map(tracker,ref,context);
if ( calls == null || calls.variation == null || calls.genotypes == null )
return null;
else {
return calls.second.get(0);
return calls.genotypes.get(0);
}
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RodGenotypeChipAsGFF;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ListUtils;
@ -77,9 +78,9 @@ public class SnpCallRateByCoverageWalker extends LocusWalker<List<String>, Strin
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
AlignmentContext subContext = new AlignmentContext(context.getLocation(), sub_reads, sub_offsets);
Pair<VariationCall, List<Genotype>> calls = UG.map(tracker, ref, subContext);
if (calls != null && calls.second != null && calls.second.size() > 0) {
Genotype call = calls.second.get(0);
VariantCallContext calls = UG.map(tracker, ref, subContext);
if (calls != null && calls.genotypes != null && calls.genotypes.size() > 0) {
Genotype call = calls.genotypes.get(0);
String callType = (call.isVariant(call.getReference())) ? ((call.isHom()) ? "HomozygousSNP" : "HeterozygousSNP") : "HomozygousReference";
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+callType+"\t"+toGeliString(call));
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -80,10 +81,10 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
double altBalance = ((double) altCount)/((double) totalCount);
if (altBalance > 0.70) {
Pair<VariationCall, List<Genotype>> ugResult = ug.map(tracker, ref, context);
VariantCallContext ugResult = ug.map(tracker, ref, context);
if (ugResult != null && ugResult.second != null && ugResult.second.size() > 0) {
return ugResult.second.get(0).isHet();
if (ugResult != null && ugResult.genotypes != null && ugResult.genotypes.size() > 0) {
return ugResult.genotypes.get(0).isHet();
}
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.playground.utils.NamedTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
@ -57,9 +58,9 @@ public class SecondaryBaseTransitionTableWalker extends LocusWalker<Integer, Int
char nextBase = Character.toUpperCase(contextBases[contextBases.length - 1]);
if (contextBases.length == 3 && refBase != 'N' && pileup.getBases() != null && pileup.getSecondaryBases() != null) {
Pair<VariationCall,List<Genotype>> ugResult = ug.map(tracker,ref,context);
if (ugResult != null && ugResult.first != null) {
Genotype res = ugResult.second.get(0);
VariantCallContext ugResult = ug.map(tracker,ref,context);
if (ugResult != null && ugResult.variation != null) {
Genotype res = ugResult.genotypes.get(0);
String call = res.getBases();
String type;
if (!res.isVariant(refBase)) {type = "homref";}