In the process of optimizing ProduceBeagleInputWalker, discovered that the GenotypeLikelihoods, the UG, and Genotype objects were using old-style GL tags internally, and then converting from Likelihoods -> GL String -> Likelihoods -> PL String throughout the GATK. It was both painful and led to convoluted code throughout the system. Removed everything but GL conversion -> PL in the GenotypeLikelihoods objects, and now all of the codes in UG now immediately provides GenotypeLikelihoods to the Genotype objects, which is converted straight to PL now. Resulted in a 30% speed up in ProduceBeagleLikelihoods, passes integration tests without any modifications, and likely speeds up writing any VCFs with likelihoods.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5432 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-13 00:07:51 +00:00
parent ceb08f9ee6
commit b99e27bf9b
4 changed files with 20 additions and 35 deletions

View File

@ -49,6 +49,8 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.text.MessageFormat;
import java.util.*;
/**
@ -56,7 +58,6 @@ import java.util.*;
*/
@Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class))
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant";
public static final String VALIDATION_ROD_NAME = "validation";
@ -72,6 +73,11 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual";
@Hidden
@Argument(doc="REMOVE ME", shortName = "rcwpio", required=false)
private static final boolean REMAIN_COMPATIBLE_WITH_PREVIOUS_IO = false;
// @Hidden
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
// protected boolean includeFilteredRecords = false;
@ -267,15 +273,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( VQSRCalibrator != null )
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position.
// todo -- remove me after testing
if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS )
for (double likeVal: normalizedLog10Likelihoods)
if ( REMAIN_COMPATIBLE_WITH_PREVIOUS_IO && (log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS) )
for (double likeVal: normalizedLikelihoods)
out.append(String.format("%.2f ",likeVal));
else
for (double likeVal: normalizedLog10Likelihoods)
for (double likeVal: normalizedLikelihoods) {
out.append(String.format("%5.4f ",likeVal));
}
}

View File

@ -157,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )

View File

@ -168,9 +168,7 @@ public class UnifiedGenotyperEngine {
if ( vc == null )
return null;
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
vcc.vc = GLsToPLs(vcc.vc);
return vcc;
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
}
/**
@ -186,8 +184,7 @@ public class UnifiedGenotyperEngine {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
if ( stratifiedContexts == null )
return null;
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse);
return GLsToPLs(vc);
return calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse);
}
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type, Allele alternateAlleleToUse) {
@ -253,9 +250,10 @@ public class UnifiedGenotyperEngine {
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
//GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
}
@ -274,20 +272,6 @@ public class UnifiedGenotyperEngine {
null);
}
private static VariantContext GLsToPLs(VariantContext vc) {
if ( vc == null )
return null;
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
}
return VariantContext.modifyGenotypes(vc, calls);
}
/**
* Compute genotypes at a given locus.
*

View File

@ -187,18 +187,12 @@ public class VCFUtils {
* return a set of supported format lines; what we currently support for output in the genotype fields of a VCF
* @return a set of VCF format lines
*/
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(String glType) {
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY )
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
else
throw new ReviewedStingException("Unexpected GL type " + glType);
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
return result;
}