General support for both GL (log10-scaled) and PL (Phred-scaled) genotype likelihoods. All walkers now use the Tribble GenotypeLikelihoods object for parsing VCFs with genotype likelihood fields. Please use the GenotypeLikelihoods object from now on for seamless support of both the GL and PL tags. UGv2 now uses PL by default.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4589 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-10-28 01:48:47 +00:00
parent 15183ed778
commit cbce3e3c83
8 changed files with 30 additions and 19 deletions

View File

@ -200,17 +200,8 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
/**
* Use likelihoods if: this is validation and the prior is negative; or: the genotype is called and has likelihoods
*/
if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
double[] likeArray = new double[glArray.length];
// convert to double array so we can normalize
int k=0;
for (String gl : glArray) {
likeArray[k++] = Double.valueOf(gl);
}
if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasLikelihoods()) {
double[] likeArray = genotype.getLikelihoods().getAsVector();
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
@ -234,7 +225,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
/**
* otherwise, use the prior uniformly
*/
else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
else if (! isValidation && genotype.isCalled() && ! genotype.hasLikelihoods() ) {
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
// is confident. This is useful for Hapmap and 1KG release VCFs.
double AA = (1.0-prior)/2.0;

View File

@ -143,7 +143,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.GENOTYPE_LIKELIHOODS_KEY));
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -86,6 +87,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
final Interval fromInterval = new Interval(vc.getChr(), vc.getStart(), vc.getStart(), false, String.format("%s:%d", vc.getChr(), vc.getStart()));
final int length = vc.getEnd() - vc.getStart();
final Interval toInterval = liftOver.liftOver(fromInterval);
VariantContext originalVC = vc;
if ( toInterval != null ) {
// check whether the strand flips, and if so reverse complement everything
@ -95,6 +97,14 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
}
vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length));
VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false);
if ( VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) {
logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s",
originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(),
originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0)));
}
writer.add(vc, ref.getBase());
successfulIntervals++;
} else {

View File

@ -258,7 +258,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
attributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,String.format("%4.2f", 10*qual));
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
calls.put(sample, new Genotype(sample, myAlleles, qual, null, attributes, false));
}

View File

@ -128,7 +128,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
}

View File

@ -156,7 +156,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||

View File

@ -31,6 +31,7 @@ import java.math.BigDecimal;
import java.util.*;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.playground.gatk.walkers.genotyper.AlleleFrequencyCalculationModel;
/**
* MathUtils is a static class (no instantiation allowed!) with some useful math methods.
@ -756,4 +757,4 @@ public class MathUtils {
public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); }
public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); }
}
}

View File

@ -27,11 +27,13 @@ package org.broadinstitute.sting.utils.vcf;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.apache.log4j.Logger;
import java.util.*;
@ -165,12 +167,19 @@ public class VCFUtils {
* return a set of supported format lines; what we currently support for output in the genotype fields of a VCF
* @return a set of VCF format lines
*/
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(String glType) {
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else
throw new ReviewedStingException("Unexpected GL type " + glType);
return result;
}