In the process of optimizing ProduceBeagleInputWalker, discovered that the GenotypeLikelihoods, the UG, and Genotype objects were using old-style GL tags internally, and then converting from Likelihoods -> GL String -> Likelihoods -> PL String throughout the GATK. It was both painful and led to convoluted code throughout the system. Removed everything but GL conversion -> PL in the GenotypeLikelihoods objects, and now all of the codes in UG now immediately provides GenotypeLikelihoods to the Genotype objects, which is converted straight to PL now. Resulted in a 30% speed up in ProduceBeagleLikelihoods, passes integration tests without any modifications, and likely speeds up writing any VCFs with likelihoods.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5432 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ceb08f9ee6
commit
b99e27bf9b
|
|
@ -49,6 +49,8 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.text.DecimalFormat;
|
||||
import java.text.MessageFormat;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -56,7 +58,6 @@ import java.util.*;
|
|||
*/
|
||||
@Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class))
|
||||
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
public static final String ROD_NAME = "variant";
|
||||
public static final String VALIDATION_ROD_NAME = "validation";
|
||||
|
||||
|
|
@ -72,6 +73,11 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
|
||||
protected String VQSLOD_KEY = "VQSqual";
|
||||
|
||||
@Hidden
|
||||
@Argument(doc="REMOVE ME", shortName = "rcwpio", required=false)
|
||||
private static final boolean REMAIN_COMPATIBLE_WITH_PREVIOUS_IO = false;
|
||||
|
||||
|
||||
// @Hidden
|
||||
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
|
||||
// protected boolean includeFilteredRecords = false;
|
||||
|
|
@ -267,15 +273,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
if ( VQSRCalibrator != null )
|
||||
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
|
||||
|
||||
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
|
||||
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
|
||||
// see if we need to randomly mask out genotype in this position.
|
||||
// todo -- remove me after testing
|
||||
if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS )
|
||||
for (double likeVal: normalizedLog10Likelihoods)
|
||||
if ( REMAIN_COMPATIBLE_WITH_PREVIOUS_IO && (log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS) )
|
||||
for (double likeVal: normalizedLikelihoods)
|
||||
out.append(String.format("%.2f ",likeVal));
|
||||
else
|
||||
for (double likeVal: normalizedLog10Likelihoods)
|
||||
for (double likeVal: normalizedLikelihoods) {
|
||||
out.append(String.format("%5.4f ",likeVal));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
}
|
||||
|
||||
// FORMAT and INFO fields
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
|
||||
|
||||
// FILTER fields
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING )
|
||||
|
|
|
|||
|
|
@ -168,9 +168,7 @@ public class UnifiedGenotyperEngine {
|
|||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
vcc.vc = GLsToPLs(vcc.vc);
|
||||
return vcc;
|
||||
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -186,8 +184,7 @@ public class UnifiedGenotyperEngine {
|
|||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse);
|
||||
return GLsToPLs(vc);
|
||||
return calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse);
|
||||
}
|
||||
|
||||
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type, Allele alternateAlleleToUse) {
|
||||
|
|
@ -253,9 +250,10 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
||||
//GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
||||
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods());
|
||||
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
||||
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||
|
||||
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
|
||||
}
|
||||
|
|
@ -274,20 +272,6 @@ public class UnifiedGenotyperEngine {
|
|||
null);
|
||||
}
|
||||
|
||||
private static VariantContext GLsToPLs(VariantContext vc) {
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
|
||||
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
|
||||
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
|
||||
}
|
||||
return VariantContext.modifyGenotypes(vc, calls);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute genotypes at a given locus.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -187,18 +187,12 @@ public class VCFUtils {
|
|||
* return a set of supported format lines; what we currently support for output in the genotype fields of a VCF
|
||||
* @return a set of VCF format lines
|
||||
*/
|
||||
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(String glType) {
|
||||
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
|
||||
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
|
||||
if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY )
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY )
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
else
|
||||
throw new ReviewedStingException("Unexpected GL type " + glType);
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue