In the process of optimizing ProduceBeagleInputWalker, discovered that the GenotypeLikelihoods, the UG, and Genotype objects were using old-style GL tags internally, and then converting from Likelihoods -> GL String -> Likelihoods -> PL String throughout the GATK. It was both painful and led to convoluted code throughout the system. Removed everything but GL conversion -> PL in the GenotypeLikelihoods objects, and now all of the codes in UG now immediately provides GenotypeLikelihoods to the Genotype objects, which is converted straight to PL now. Resulted in a 30% speed up in ProduceBeagleLikelihoods, passes integration tests without any modifications, and likely speeds up writing any VCFs with likelihoods.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5432 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-13 00:07:51 +00:00
parent ceb08f9ee6
commit b99e27bf9b
4 changed files with 20 additions and 35 deletions

View File

@ -49,6 +49,8 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File; import java.io.File;
import java.io.PrintStream; import java.io.PrintStream;
import java.text.DecimalFormat;
import java.text.MessageFormat;
import java.util.*; import java.util.*;
/** /**
@ -56,7 +58,6 @@ import java.util.*;
*/ */
@Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class)) @Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class))
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> { public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant"; public static final String ROD_NAME = "variant";
public static final String VALIDATION_ROD_NAME = "validation"; public static final String VALIDATION_ROD_NAME = "validation";
@ -72,6 +73,11 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false) @Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual"; protected String VQSLOD_KEY = "VQSqual";
@Hidden
@Argument(doc="REMOVE ME", shortName = "rcwpio", required=false)
private static final boolean REMAIN_COMPATIBLE_WITH_PREVIOUS_IO = false;
// @Hidden // @Hidden
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false) // @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
// protected boolean includeFilteredRecords = false; // protected boolean includeFilteredRecords = false;
@ -267,15 +273,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( VQSRCalibrator != null ) if ( VQSRCalibrator != null )
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position. // see if we need to randomly mask out genotype in this position.
// todo -- remove me after testing // todo -- remove me after testing
if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS ) if ( REMAIN_COMPATIBLE_WITH_PREVIOUS_IO && (log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS) )
for (double likeVal: normalizedLog10Likelihoods) for (double likeVal: normalizedLikelihoods)
out.append(String.format("%.2f ",likeVal)); out.append(String.format("%.2f ",likeVal));
else else
for (double likeVal: normalizedLog10Likelihoods) for (double likeVal: normalizedLikelihoods) {
out.append(String.format("%5.4f ",likeVal)); out.append(String.format("%5.4f ",likeVal));
}
} }

View File

@ -157,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
} }
// FORMAT and INFO fields // FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)); headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
// FILTER fields // FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ) if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )

View File

@ -168,9 +168,7 @@ public class UnifiedGenotyperEngine {
if ( vc == null ) if ( vc == null )
return null; return null;
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc); return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
vcc.vc = GLsToPLs(vcc.vc);
return vcc;
} }
/** /**
@ -186,8 +184,7 @@ public class UnifiedGenotyperEngine {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
if ( stratifiedContexts == null ) if ( stratifiedContexts == null )
return null; return null;
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse); return calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse);
return GLsToPLs(vc);
} }
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type, Allele alternateAlleleToUse) { private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type, Allele alternateAlleleToUse) {
@ -253,9 +250,10 @@ public class UnifiedGenotyperEngine {
} }
HashMap<String, Object> attributes = new HashMap<String, Object>(); HashMap<String, Object> attributes = new HashMap<String, Object>();
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); //GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
} }
@ -274,20 +272,6 @@ public class UnifiedGenotyperEngine {
null); null);
} }
private static VariantContext GLsToPLs(VariantContext vc) {
if ( vc == null )
return null;
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
}
return VariantContext.modifyGenotypes(vc, calls);
}
/** /**
* Compute genotypes at a given locus. * Compute genotypes at a given locus.
* *

View File

@ -187,18 +187,12 @@ public class VCFUtils {
* return a set of supported format lines; what we currently support for output in the genotype fields of a VCF * return a set of supported format lines; what we currently support for output in the genotype fields of a VCF
* @return a set of VCF format lines * @return a set of VCF format lines
*/ */
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(String glType) { public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>(); Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else
throw new ReviewedStingException("Unexpected GL type " + glType);
return result; return result;
} }