Optimized version of produce beagle tool, along with experimental (hidden) support for combining likelihoods depending on estimate false positive rate.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5430 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ee8f2871f7
commit
d01d4fdeb5
|
|
@ -31,6 +31,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
|
|||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -39,12 +40,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration.VQSRCalibrationCurve;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -60,6 +63,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
@Output(doc="File to which BEAGLE input should be written",required=true)
|
||||
protected PrintStream beagleWriter = null;
|
||||
|
||||
@Hidden
|
||||
@Input(doc="VQSqual calibration file", shortName = "cc", required=false)
|
||||
protected File VQSRCalibrationFile = null;
|
||||
protected VQSRCalibrationCurve VQSRCalibrator = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
|
||||
protected String VQSLOD_KEY = "VQSqual";
|
||||
|
||||
// @Hidden
|
||||
// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
|
||||
// protected boolean includeFilteredRecords = false;
|
||||
|
||||
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
||||
public double insertedNoCallRate = 0;
|
||||
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
|
||||
|
|
@ -96,6 +112,12 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
if ( bootstrapVCFOutput != null ) {
|
||||
initializeVcfWriter();
|
||||
}
|
||||
|
||||
if ( VQSRCalibrationFile != null ) {
|
||||
VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
|
||||
logger.info("Read calibration curve");
|
||||
VQSRCalibrator.printInfo(logger);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
|
@ -133,6 +155,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
public boolean goodSite(VariantContext v) {
|
||||
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
|
||||
//return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes();
|
||||
}
|
||||
|
||||
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
|
||||
|
|
@ -234,13 +257,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
|
||||
}
|
||||
|
||||
writeSampleLikelihoods(beagleOut, log10Likelihoods);
|
||||
writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
|
||||
}
|
||||
|
||||
beagleWriter.println(beagleOut.toString());
|
||||
}
|
||||
|
||||
private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) {
|
||||
private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
|
||||
if ( VQSRCalibrator != null )
|
||||
log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
|
||||
|
||||
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
|
||||
// see if we need to randomly mask out genotype in this position.
|
||||
// todo -- remove me after testing
|
||||
|
|
|
|||
|
|
@ -0,0 +1,116 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantQualityScore;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 3/11/11
|
||||
* Time: 10:33 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class VQSRCalibrationCurve {
|
||||
private final static boolean DEBUG = false;
|
||||
List<VQSRRange> points;
|
||||
|
||||
private static class VQSRRange {
|
||||
double start, stop, truePositiveRate;
|
||||
|
||||
public double getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
public double getStop() {
|
||||
return stop;
|
||||
}
|
||||
|
||||
public double getTruePositiveRate() {
|
||||
return truePositiveRate;
|
||||
}
|
||||
|
||||
private VQSRRange(double start, double stop, double truePositiveRate) {
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.truePositiveRate = truePositiveRate;
|
||||
}
|
||||
}
|
||||
|
||||
public static VQSRCalibrationCurve readFromFile(File source) {
|
||||
List<VQSRRange> points = new ArrayList<VQSRRange>();
|
||||
|
||||
try {
|
||||
for ( String line : new XReadLines(source).readLines() ) {
|
||||
if ( ! line.trim().isEmpty() ) {
|
||||
String[] parts = line.split("\\s+");
|
||||
points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2])));
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(source, e);
|
||||
}
|
||||
|
||||
// ensure that the entire range gets caught
|
||||
points.get(0).start = Double.POSITIVE_INFINITY;
|
||||
points.get(points.size()-1).stop = Double.NEGATIVE_INFINITY;
|
||||
|
||||
return new VQSRCalibrationCurve(points);
|
||||
}
|
||||
|
||||
protected VQSRCalibrationCurve(List<VQSRRange> points) {
|
||||
this.points = points;
|
||||
}
|
||||
|
||||
public double probTrueVariant(double VQSRqual) {
|
||||
for ( VQSRRange r : points ) {
|
||||
if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() )
|
||||
return r.getTruePositiveRate();
|
||||
}
|
||||
|
||||
throw new ReviewedStingException("BUG: should not be able to reach this code");
|
||||
}
|
||||
|
||||
public double probTrueVariant(String VQSRQualKey, VariantContext vc) {
|
||||
if ( vc.isFiltered() )
|
||||
return 0.0;
|
||||
else if ( vc.hasAttribute(VQSRQualKey) ) {
|
||||
double qual = vc.getAttributeAsDouble(VQSRQualKey);
|
||||
return probTrueVariant(qual);
|
||||
} else {
|
||||
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
|
||||
}
|
||||
}
|
||||
|
||||
public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) {
|
||||
double[] updated = new double[log10Likelihoods.length];
|
||||
|
||||
double alpha = probTrueVariant(VQSRQualKey, vc);
|
||||
double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me
|
||||
double noInfoPr = 1.0 / 3;
|
||||
if ( DEBUG ) System.out.printf("------------------------------%n");
|
||||
for ( int i = 0; i < log10Likelihoods.length; i++) {
|
||||
double p = Math.pow(10, log10Likelihoods[i]);
|
||||
double q = alpha * p + (1-alpha) * noInfoPr;
|
||||
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q);
|
||||
updated[i] = Math.log10(q);
|
||||
}
|
||||
|
||||
return updated;
|
||||
}
|
||||
|
||||
|
||||
public void printInfo(Logger logger) {
|
||||
for ( VQSRRange r : points ) {
|
||||
logger.info(String.format(" start=%f stop=%f TPrate=%.6e", r.getStart(), r.getStop(), r.getTruePositiveRate()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -28,6 +28,8 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -147,6 +149,11 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class VariantContextMissingRequiredField extends UserException {
|
||||
public VariantContextMissingRequiredField(String field, VariantContext vc) {
|
||||
super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field));
|
||||
}
|
||||
}
|
||||
|
||||
public static class MissortedFile extends UserException {
|
||||
public MissortedFile(File file, String message, Exception e) {
|
||||
|
|
|
|||
|
|
@ -98,7 +98,8 @@ class StandardVariantEvaluation extends QScript {
|
|||
//
|
||||
// SNP call sets
|
||||
//
|
||||
addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
|
||||
addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
|
||||
addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf"))
|
||||
// todo -- are there other good call sets for evaluation?
|
||||
// todo -- add hg19 na12878 64x
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,126 @@
|
|||
package oneoffs.depristo
|
||||
|
||||
//import net.sf.picard.reference.FastaSequenceFile
|
||||
//import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
//import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
//import collection.JavaConversions._
|
||||
|
||||
class RefineGenotypesWithBeagle extends QScript {
|
||||
qscript =>
|
||||
|
||||
@Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _
|
||||
@Argument(doc="Path to GATK jar",required=true,shortName="gatkjarfile") var gatkJarFile: File = _
|
||||
@Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _
|
||||
@Argument(doc="Reference file",required=true,shortName="R") var reference: File = _
|
||||
@Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null
|
||||
@Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null
|
||||
@Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 6
|
||||
@Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve")
|
||||
|
||||
@Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false
|
||||
|
||||
val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf")
|
||||
val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
|
||||
|
||||
trait GATKArgs extends CommandLineGATK {
|
||||
this.reference_sequence = qscript.reference
|
||||
this.jarFile = qscript.gatkJarFile
|
||||
this.memoryLimit = Some(2)
|
||||
}
|
||||
|
||||
class GunzipFile(in: File, out:File ) extends CommandLineFunction {
|
||||
@Input(doc="file to gunzip") var inp = in
|
||||
@Output(doc="file to gunzip to") var outp = out
|
||||
|
||||
def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath)
|
||||
}
|
||||
|
||||
class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction {
|
||||
@Input(doc="The beagle input file") var beagleInput: File = _
|
||||
var beagleOutputBase: String = _
|
||||
var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB
|
||||
|
||||
/**
|
||||
* Note: These get set
|
||||
*/
|
||||
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
|
||||
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
|
||||
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
|
||||
var beagleOutputDir: String = _
|
||||
|
||||
def freezeOutputs = {
|
||||
if ( beagleInput.getParent == null ) {
|
||||
beagleOutputDir = ""
|
||||
} else {
|
||||
beagleOutputDir = beagleInput.getParent
|
||||
}
|
||||
beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz")
|
||||
beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
|
||||
beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2")
|
||||
}
|
||||
|
||||
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase)
|
||||
}
|
||||
|
||||
def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = {
|
||||
var beagleInput = new ProduceBeagleInput with GATKArgs
|
||||
if ( interval != null ) beagleInput.intervalsString = List(interval)
|
||||
beagleInput.variantVCF = inputVCF
|
||||
beagleInput.out = swapExt(outputVCF,".vcf",".beagle")
|
||||
if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE
|
||||
|
||||
var refine = new BeagleRefinement(moreBeagleArgs)
|
||||
refine.beagleInput = beagleInput.out
|
||||
refine.beagleOutputBase = outputVCF.getName + ".bout"
|
||||
refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB
|
||||
refine.memoryLimit = Some(BEAGLE_MEM_IN_GB)
|
||||
refine.freezeOutputs
|
||||
|
||||
var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
|
||||
var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
|
||||
// unzipPhased.isIntermediate = true
|
||||
// unzipProbs.isIntermediate = true
|
||||
|
||||
var vcfConvert = new BeagleOutputToVCF with GATKArgs
|
||||
vcfConvert.variantVCF = inputVCF
|
||||
vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
|
||||
vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp)
|
||||
vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp)
|
||||
vcfConvert.out = outputVCF
|
||||
|
||||
for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
|
||||
add(cmd)
|
||||
}
|
||||
|
||||
class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs {
|
||||
this.noST = true
|
||||
this.evalModule :+= "GenotypeConcordance"
|
||||
this.o = swapExt(eval, ".vcf", ".vcf.eval")
|
||||
this.rodBind :+= RodBind("eval", "VCF", eval)
|
||||
this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF)
|
||||
this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF)
|
||||
if ( EvalInterval != null ) {
|
||||
Console.printf("EvalInterval " + EvalInterval)
|
||||
this.intervalsString = List(EvalInterval)
|
||||
}
|
||||
}
|
||||
|
||||
def script = {
|
||||
for ( vcf <- vcfsToBeagle ) {
|
||||
if ( ! TEST ) add(new MyEval(vcf))
|
||||
// for ( niter <- List(10) ) {
|
||||
for ( useCalibrationCurve <- List(true, false) ) {
|
||||
// for ( includeFilteredRecords <- List(true, false) ) {
|
||||
for ( niter <- List(10, 25, 50) ) {
|
||||
if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) {
|
||||
val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve))
|
||||
RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
|
||||
add(new MyEval(refine_out))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue