Optimized version of produce beagle tool, along with experimental (hidden) support for combining likelihoods depending on estimate false positive rate.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5430 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-12 02:06:28 +00:00
parent ee8f2871f7
commit d01d4fdeb5
5 changed files with 279 additions and 3 deletions

View File

@ -31,6 +31,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -39,12 +40,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration.VQSRCalibrationCurve;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
@ -60,6 +63,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
@Hidden
@Input(doc="VQSqual calibration file", shortName = "cc", required=false)
protected File VQSRCalibrationFile = null;
protected VQSRCalibrationCurve VQSRCalibrator = null;
@Hidden
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual";
// @Hidden
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
// protected boolean includeFilteredRecords = false;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@ -96,6 +112,12 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( bootstrapVCFOutput != null ) {
initializeVcfWriter();
}
if ( VQSRCalibrationFile != null ) {
VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
logger.info("Read calibration curve");
VQSRCalibrator.printInfo(logger);
}
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
@ -133,6 +155,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
//return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
@ -234,13 +257,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
}
writeSampleLikelihoods(beagleOut, log10Likelihoods);
writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
}
beagleWriter.println(beagleOut.toString());
}
private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) {
private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
if ( VQSRCalibrator != null )
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position.
// todo -- remove me after testing

View File

@ -0,0 +1,116 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantQualityScore;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 3/11/11
* Time: 10:33 AM
* To change this template use File | Settings | File Templates.
*/
public class VQSRCalibrationCurve {
private final static boolean DEBUG = false;
List<VQSRRange> points;
private static class VQSRRange {
double start, stop, truePositiveRate;
public double getStart() {
return start;
}
public double getStop() {
return stop;
}
public double getTruePositiveRate() {
return truePositiveRate;
}
private VQSRRange(double start, double stop, double truePositiveRate) {
this.start = start;
this.stop = stop;
this.truePositiveRate = truePositiveRate;
}
}
public static VQSRCalibrationCurve readFromFile(File source) {
List<VQSRRange> points = new ArrayList<VQSRRange>();
try {
for ( String line : new XReadLines(source).readLines() ) {
if ( ! line.trim().isEmpty() ) {
String[] parts = line.split("\\s+");
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2])));
}
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(source, e);
}
// ensure that the entire range gets caught
points.get(0).start = Double.POSITIVE_INFINITY;
points.get(points.size()-1).stop = Double.NEGATIVE_INFINITY;
return new VQSRCalibrationCurve(points);
}
protected VQSRCalibrationCurve(List<VQSRRange> points) {
this.points = points;
}
public double probTrueVariant(double VQSRqual) {
for ( VQSRRange r : points ) {
if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() )
return r.getTruePositiveRate();
}
throw new ReviewedStingException("BUG: should not be able to reach this code");
}
public double probTrueVariant(String VQSRQualKey, VariantContext vc) {
if ( vc.isFiltered() )
return 0.0;
else if ( vc.hasAttribute(VQSRQualKey) ) {
double qual = vc.getAttributeAsDouble(VQSRQualKey);
return probTrueVariant(qual);
} else {
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
}
}
public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) {
double[] updated = new double[log10Likelihoods.length];
double alpha = probTrueVariant(VQSRQualKey, vc);
double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me
double noInfoPr = 1.0 / 3;
if ( DEBUG ) System.out.printf("------------------------------%n");
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q);
updated[i] = Math.log10(q);
}
return updated;
}
public void printInfo(Logger logger) {
for ( VQSRRange r : points ) {
logger.info(String.format(" start=%f stop=%f TPrate=%.6e", r.getStart(), r.getStop(), r.getTruePositiveRate()));
}
}
}

View File

@ -28,6 +28,8 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import java.io.File;
import java.util.Arrays;
@ -147,6 +149,11 @@ public class UserException extends ReviewedStingException {
}
}
public static class VariantContextMissingRequiredField extends UserException {
public VariantContextMissingRequiredField(String field, VariantContext vc) {
super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field));
}
}
public static class MissortedFile extends UserException {
public MissortedFile(File file, String message, Exception e) {

View File

@ -98,7 +98,8 @@ class StandardVariantEvaluation extends QScript {
//
// SNP call sets
//
addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf"))
// todo -- are there other good call sets for evaluation?
// todo -- add hg19 na12878 64x
}

View File

@ -0,0 +1,126 @@
package oneoffs.depristo
//import net.sf.picard.reference.FastaSequenceFile
//import org.broadinstitute.sting.datasources.pipeline.Pipeline
//import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
//import collection.JavaConversions._
class RefineGenotypesWithBeagle extends QScript {
qscript =>
@Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _
@Argument(doc="Path to GATK jar",required=true,shortName="gatkjarfile") var gatkJarFile: File = _
@Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _
@Argument(doc="Reference file",required=true,shortName="R") var reference: File = _
@Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null
@Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null
@Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 6
@Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve")
@Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false
val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf")
val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
trait GATKArgs extends CommandLineGATK {
this.reference_sequence = qscript.reference
this.jarFile = qscript.gatkJarFile
this.memoryLimit = Some(2)
}
class GunzipFile(in: File, out:File ) extends CommandLineFunction {
@Input(doc="file to gunzip") var inp = in
@Output(doc="file to gunzip to") var outp = out
def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath)
}
class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction {
@Input(doc="The beagle input file") var beagleInput: File = _
var beagleOutputBase: String = _
var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB
/**
* Note: These get set
*/
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
var beagleOutputDir: String = _
def freezeOutputs = {
if ( beagleInput.getParent == null ) {
beagleOutputDir = ""
} else {
beagleOutputDir = beagleInput.getParent
}
beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz")
beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2")
}
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase)
}
def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = {
var beagleInput = new ProduceBeagleInput with GATKArgs
if ( interval != null ) beagleInput.intervalsString = List(interval)
beagleInput.variantVCF = inputVCF
beagleInput.out = swapExt(outputVCF,".vcf",".beagle")
if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE
var refine = new BeagleRefinement(moreBeagleArgs)
refine.beagleInput = beagleInput.out
refine.beagleOutputBase = outputVCF.getName + ".bout"
refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB
refine.memoryLimit = Some(BEAGLE_MEM_IN_GB)
refine.freezeOutputs
var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
// unzipPhased.isIntermediate = true
// unzipProbs.isIntermediate = true
var vcfConvert = new BeagleOutputToVCF with GATKArgs
vcfConvert.variantVCF = inputVCF
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp)
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp)
vcfConvert.out = outputVCF
for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
add(cmd)
}
class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs {
this.noST = true
this.evalModule :+= "GenotypeConcordance"
this.o = swapExt(eval, ".vcf", ".vcf.eval")
this.rodBind :+= RodBind("eval", "VCF", eval)
this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF)
this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF)
if ( EvalInterval != null ) {
Console.printf("EvalInterval " + EvalInterval)
this.intervalsString = List(EvalInterval)
}
}
def script = {
for ( vcf <- vcfsToBeagle ) {
if ( ! TEST ) add(new MyEval(vcf))
// for ( niter <- List(10) ) {
for ( useCalibrationCurve <- List(true, false) ) {
// for ( includeFilteredRecords <- List(true, false) ) {
for ( niter <- List(10, 25, 50) ) {
if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) {
val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve))
RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
add(new MyEval(refine_out))
}
}
}
}
}
}