diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 4c4b34479..7965eefb1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -31,6 +31,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -39,12 +40,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import java.io.File; import java.io.PrintStream; import java.util.*; @@ -60,6 +63,19 @@ public class ProduceBeagleInputWalker extends RodWalker { @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; + @Hidden + @Input(doc="VQSqual calibration file", shortName = "cc", required=false) + protected File VQSRCalibrationFile = null; + protected VQSRCalibrationCurve VQSRCalibrator = null; + + @Hidden + @Argument(doc="VQSqual key", shortName = "vqskey", required=false) + protected String VQSLOD_KEY = "VQSqual"; + +// @Hidden +// @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false) +// protected boolean includeFilteredRecords = false; + @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) @@ -96,6 +112,12 @@ public class ProduceBeagleInputWalker extends RodWalker { if ( bootstrapVCFOutput != null ) { initializeVcfWriter(); } + + if ( VQSRCalibrationFile != null ) { + VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile); + logger.info("Read calibration curve"); + VQSRCalibrator.printInfo(logger); + } } public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { @@ -133,6 +155,7 @@ public class ProduceBeagleInputWalker extends RodWalker { public boolean goodSite(VariantContext v) { return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); + //return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes(); } public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { @@ -234,13 +257,16 @@ public class ProduceBeagleInputWalker extends RodWalker { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } - writeSampleLikelihoods(beagleOut, log10Likelihoods); + writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods); } beagleWriter.println(beagleOut.toString()); } - private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) { + private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) { + if ( VQSRCalibrator != null ) + log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); + double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); // see if we need to randomly mask out genotype in this position. // todo -- remove me after testing diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java new file mode 100644 index 000000000..f67b51970 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -0,0 +1,116 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration; + +import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantQualityScore; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 3/11/11 + * Time: 10:33 AM + * To change this template use File | Settings | File Templates. + */ +public class VQSRCalibrationCurve { + private final static boolean DEBUG = false; + List points; + + private static class VQSRRange { + double start, stop, truePositiveRate; + + public double getStart() { + return start; + } + + public double getStop() { + return stop; + } + + public double getTruePositiveRate() { + return truePositiveRate; + } + + private VQSRRange(double start, double stop, double truePositiveRate) { + this.start = start; + this.stop = stop; + this.truePositiveRate = truePositiveRate; + } + } + + public static VQSRCalibrationCurve readFromFile(File source) { + List points = new ArrayList(); + + try { + for ( String line : new XReadLines(source).readLines() ) { + if ( ! line.trim().isEmpty() ) { + String[] parts = line.split("\\s+"); + points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2]))); + } + } + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + + // ensure that the entire range gets caught + points.get(0).start = Double.POSITIVE_INFINITY; + points.get(points.size()-1).stop = Double.NEGATIVE_INFINITY; + + return new VQSRCalibrationCurve(points); + } + + protected VQSRCalibrationCurve(List points) { + this.points = points; + } + + public double probTrueVariant(double VQSRqual) { + for ( VQSRRange r : points ) { + if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() ) + return r.getTruePositiveRate(); + } + + throw new ReviewedStingException("BUG: should not be able to reach this code"); + } + + public double probTrueVariant(String VQSRQualKey, VariantContext vc) { + if ( vc.isFiltered() ) + return 0.0; + else if ( vc.hasAttribute(VQSRQualKey) ) { + double qual = vc.getAttributeAsDouble(VQSRQualKey); + return probTrueVariant(qual); + } else { + throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc); + } + } + + public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) { + double[] updated = new double[log10Likelihoods.length]; + + double alpha = probTrueVariant(VQSRQualKey, vc); + double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me + double noInfoPr = 1.0 / 3; + if ( DEBUG ) System.out.printf("------------------------------%n"); + for ( int i = 0; i < log10Likelihoods.length; i++) { + double p = Math.pow(10, log10Likelihoods[i]); + double q = alpha * p + (1-alpha) * noInfoPr; + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q); + updated[i] = Math.log10(q); + } + + return updated; + } + + + public void printInfo(Logger logger) { + for ( VQSRRange r : points ) { + logger.info(String.format(" start=%f stop=%f TPrate=%.6e", r.getStart(), r.getStop(), r.getTruePositiveRate())); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 2b21fc473..41499af1c 100755 --- a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -28,6 +28,8 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import java.io.File; import java.util.Arrays; @@ -147,6 +149,11 @@ public class UserException extends ReviewedStingException { } } + public static class VariantContextMissingRequiredField extends UserException { + public VariantContextMissingRequiredField(String field, VariantContext vc) { + super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); + } + } public static class MissortedFile extends UserException { public MissortedFile(File file, String message, Exception e) { diff --git a/scala/qscript/core/StandardVariantEvaluation.scala b/scala/qscript/core/StandardVariantEvaluation.scala index e316b7ad5..34db7f9bb 100755 --- a/scala/qscript/core/StandardVariantEvaluation.scala +++ b/scala/qscript/core/StandardVariantEvaluation.scala @@ -98,7 +98,8 @@ class StandardVariantEvaluation extends QScript { // // SNP call sets // - addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")) + addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")) + addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf")) // todo -- are there other good call sets for evaluation? // todo -- add hg19 na12878 64x } diff --git a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q new file mode 100755 index 000000000..402d7845d --- /dev/null +++ b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q @@ -0,0 +1,126 @@ +package oneoffs.depristo + +//import net.sf.picard.reference.FastaSequenceFile +//import org.broadinstitute.sting.datasources.pipeline.Pipeline +//import org.broadinstitute.sting.gatk.DownsampleType +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +//import collection.JavaConversions._ + +class RefineGenotypesWithBeagle extends QScript { + qscript => + + @Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _ + @Argument(doc="Path to GATK jar",required=true,shortName="gatkjarfile") var gatkJarFile: File = _ + @Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _ + @Argument(doc="Reference file",required=true,shortName="R") var reference: File = _ + @Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null + @Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null + @Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 6 + @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve") + + @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false + + val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf") + val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") + + trait GATKArgs extends CommandLineGATK { + this.reference_sequence = qscript.reference + this.jarFile = qscript.gatkJarFile + this.memoryLimit = Some(2) + } + + class GunzipFile(in: File, out:File ) extends CommandLineFunction { + @Input(doc="file to gunzip") var inp = in + @Output(doc="file to gunzip to") var outp = out + + def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath) + } + + class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction { + @Input(doc="The beagle input file") var beagleInput: File = _ + var beagleOutputBase: String = _ + var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB + + /** + * Note: These get set + */ + @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ + @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ + @Output(doc="The beagle r2 file") var beagleRSquared: File = _ + var beagleOutputDir: String = _ + + def freezeOutputs = { + if ( beagleInput.getParent == null ) { + beagleOutputDir = "" + } else { + beagleOutputDir = beagleInput.getParent + } + beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz") + beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") + beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2") + } + + def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase) + } + + def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = { + var beagleInput = new ProduceBeagleInput with GATKArgs + if ( interval != null ) beagleInput.intervalsString = List(interval) + beagleInput.variantVCF = inputVCF + beagleInput.out = swapExt(outputVCF,".vcf",".beagle") + if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE + + var refine = new BeagleRefinement(moreBeagleArgs) + refine.beagleInput = beagleInput.out + refine.beagleOutputBase = outputVCF.getName + ".bout" + refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB + refine.memoryLimit = Some(BEAGLE_MEM_IN_GB) + refine.freezeOutputs + + var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) + var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) +// unzipPhased.isIntermediate = true +// unzipProbs.isIntermediate = true + + var vcfConvert = new BeagleOutputToVCF with GATKArgs + vcfConvert.variantVCF = inputVCF + vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared) + vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp) + vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp) + vcfConvert.out = outputVCF + + for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) ) + add(cmd) + } + + class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs { + this.noST = true + this.evalModule :+= "GenotypeConcordance" + this.o = swapExt(eval, ".vcf", ".vcf.eval") + this.rodBind :+= RodBind("eval", "VCF", eval) + this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF) + this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF) + if ( EvalInterval != null ) { + Console.printf("EvalInterval " + EvalInterval) + this.intervalsString = List(EvalInterval) + } + } + + def script = { + for ( vcf <- vcfsToBeagle ) { + if ( ! TEST ) add(new MyEval(vcf)) +// for ( niter <- List(10) ) { + for ( useCalibrationCurve <- List(true, false) ) { +// for ( includeFilteredRecords <- List(true, false) ) { + for ( niter <- List(10, 25, 50) ) { + if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) { + val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve)) + RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) + add(new MyEval(refine_out)) + } + } + } + } + } +} \ No newline at end of file