Optimized version of produce beagle tool, along with experimental (hidden) support for combining likelihoods depending on estimate false positive rate.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5430 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-12 02:06:28 +00:00 · 2011-03-12 02:06:28 +00:00 · d01d4fdeb5
parent ee8f2871f7
commit d01d4fdeb5
5 changed files with 279 additions and 3 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@ -31,6 +31,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
 import org.broad.tribble.vcf.*;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Hidden;
+import org.broadinstitute.sting.commandline.Input;
 import org.broadinstitute.sting.commandline.Output;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -39,12 +40,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.gatk.walkers.RMD;
 import org.broadinstitute.sting.gatk.walkers.Requires;
+import org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration.VQSRCalibrationCurve;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.utils.exceptions.StingException;
 import org.broadinstitute.sting.utils.vcf.VCFUtils;

+import java.io.File;
 import java.io.PrintStream;
 import java.util.*;

@ -60,6 +63,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
    @Output(doc="File to which BEAGLE input should be written",required=true)
    protected PrintStream  beagleWriter = null;

+    @Hidden
+    @Input(doc="VQSqual calibration file", shortName = "cc", required=false)
+    protected File VQSRCalibrationFile = null;
+    protected VQSRCalibrationCurve VQSRCalibrator = null;
+
+    @Hidden
+    @Argument(doc="VQSqual key", shortName = "vqskey", required=false)
+    protected String VQSLOD_KEY = "VQSqual";
+
+//    @Hidden
+//    @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false)
+//    protected boolean includeFilteredRecords = false;
+
    @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
    public double insertedNoCallRate  = 0;
    @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@ -96,6 +112,12 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
        if ( bootstrapVCFOutput != null ) {
            initializeVcfWriter();
        }
+
+        if ( VQSRCalibrationFile != null ) {
+            VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
+            logger.info("Read calibration curve");
+            VQSRCalibrator.printInfo(logger);
+        }
    }

    public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
@ -133,6 +155,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {

    public boolean goodSite(VariantContext v) {
        return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
+        //return v != null && (includeFilteredRecords || ! v.isFiltered()) && v.isBiallelic() && v.hasGenotypes();
    }

    public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
@ -234,13 +257,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
                log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
            }

-            writeSampleLikelihoods(beagleOut, log10Likelihoods);
+            writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
        }

        beagleWriter.println(beagleOut.toString());
    }

-    private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) {
+    private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
+        if ( VQSRCalibrator != null )
+            log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
+
        double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
        // see if we need to randomly mask out genotype in this position.
        // todo -- remove me after testing
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java
@ -0,0 +1,116 @@
+package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
+
+import org.apache.log4j.Logger;
+import org.broad.tribble.util.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantQualityScore;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: depristo
+ * Date: 3/11/11
+ * Time: 10:33 AM
+ * To change this template use File | Settings | File Templates.
+ */
+public class VQSRCalibrationCurve {
+    private final static boolean DEBUG = false;
+    List<VQSRRange> points;
+
+    private static class VQSRRange {
+        double start, stop, truePositiveRate;
+
+        public double getStart() {
+            return start;
+        }
+
+        public double getStop() {
+            return stop;
+        }
+
+        public double getTruePositiveRate() {
+            return truePositiveRate;
+        }
+
+        private VQSRRange(double start, double stop, double truePositiveRate) {
+            this.start = start;
+            this.stop = stop;
+            this.truePositiveRate = truePositiveRate;
+        }
+    }
+
+    public static VQSRCalibrationCurve readFromFile(File source) {
+        List<VQSRRange> points = new ArrayList<VQSRRange>();
+
+        try {
+            for ( String line : new XReadLines(source).readLines() ) {
+                if ( ! line.trim().isEmpty() ) {
+                    String[] parts = line.split("\\s+");
+                    points.add(new VQSRRange(Double.parseDouble(parts[0]), Double.parseDouble(parts[1]), 1 - Double.parseDouble(parts[2])));
+                }
+            }
+        } catch ( FileNotFoundException e ) {
+            throw new UserException.CouldNotReadInputFile(source, e);
+        }
+
+        // ensure that the entire range gets caught
+        points.get(0).start = Double.POSITIVE_INFINITY;
+        points.get(points.size()-1).stop = Double.NEGATIVE_INFINITY;
+
+        return new VQSRCalibrationCurve(points);
+    }
+
+    protected VQSRCalibrationCurve(List<VQSRRange> points) {
+        this.points = points;
+    }
+
+    public double probTrueVariant(double VQSRqual) {
+        for ( VQSRRange r : points ) {
+            if ( VQSRqual <= r.getStart() && VQSRqual > r.getStop() )
+                return r.getTruePositiveRate();
+        }
+
+        throw new ReviewedStingException("BUG: should not be able to reach this code");
+    }
+
+    public double probTrueVariant(String VQSRQualKey, VariantContext vc) {
+        if ( vc.isFiltered() )
+            return 0.0;
+        else if ( vc.hasAttribute(VQSRQualKey) ) {
+            double qual = vc.getAttributeAsDouble(VQSRQualKey);
+            return probTrueVariant(qual);
+        } else {
+            throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
+        }
+    }
+
+    public double[] includeErrorRateInLikelihoods(String VQSRQualKey, VariantContext vc, double[] log10Likelihoods) {
+        double[] updated = new double[log10Likelihoods.length];
+
+        double alpha = probTrueVariant(VQSRQualKey, vc);
+        double qual = vc.getAttributeAsDouble(VQSRQualKey); // todo -- remove me
+        double noInfoPr = 1.0 / 3;
+        if ( DEBUG ) System.out.printf("------------------------------%n");
+        for ( int i = 0; i < log10Likelihoods.length; i++) {
+            double p = Math.pow(10, log10Likelihoods[i]);
+            double q = alpha * p + (1-alpha) * noInfoPr;
+            if ( DEBUG ) System.out.printf("  vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", qual, p, alpha, q);
+            updated[i] = Math.log10(q);
+        }
+
+        return updated;
+    }
+
+
+    public void printInfo(Logger logger) {
+        for ( VQSRRange r : points ) {
+            logger.info(String.format("  start=%f stop=%f TPrate=%.6e", r.getStart(), r.getStop(), r.getTruePositiveRate()));
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@ -28,6 +28,8 @@ import net.sf.samtools.SAMFileHeader;
 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMSequenceDictionary;
 import net.sf.samtools.SAMSequenceRecord;
+import org.broad.tribble.util.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;

 import java.io.File;
 import java.util.Arrays;
@ -147,6 +149,11 @@ public class UserException extends ReviewedStingException {
        }
    }

+    public static class VariantContextMissingRequiredField extends UserException {
+        public VariantContextMissingRequiredField(String field, VariantContext vc) {
+            super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field));
+        }
+    }

    public static class MissortedFile extends UserException {
        public MissortedFile(File file, String message, Exception e) {
--- a/scala/qscript/core/StandardVariantEvaluation.scala
+++ b/scala/qscript/core/StandardVariantEvaluation.scala
@ -98,7 +98,8 @@ class StandardVariantEvaluation extends QScript {
    //
    // SNP call sets
    //
-    addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
+    addEval(new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
+    addEval(new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf"))
    // todo -- are there other good call sets for evaluation?
    // todo -- add hg19 na12878 64x
  }
--- a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q
+++ b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q
@ -0,0 +1,126 @@
+package oneoffs.depristo
+
+//import net.sf.picard.reference.FastaSequenceFile
+//import org.broadinstitute.sting.datasources.pipeline.Pipeline
+//import org.broadinstitute.sting.gatk.DownsampleType
+import org.broadinstitute.sting.queue.extensions.gatk._
+import org.broadinstitute.sting.queue.QScript
+//import collection.JavaConversions._
+
+class RefineGenotypesWithBeagle extends QScript {
+  qscript =>
+
+  @Argument(doc="VCF file to run beagle genotype refinement on",required=true,shortName="vcf") var vcfsToBeagle: List[File] = _
+  @Argument(doc="Path to GATK jar",required=true,shortName="gatkjarfile") var gatkJarFile: File = _
+  @Argument(doc="Path to BEAGLE jar",required=true,shortName="beagle") var beagleJar: File = _
+  @Argument(doc="Reference file",required=true,shortName="R") var reference: File = _
+  @Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null
+  @Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null
+  @Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 6
+  @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve")
+
+  @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false
+
+  val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf")
+  val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf")
+
+  trait GATKArgs extends CommandLineGATK {
+    this.reference_sequence = qscript.reference
+    this.jarFile = qscript.gatkJarFile
+    this.memoryLimit = Some(2)
+  }
+
+  class GunzipFile(in: File, out:File ) extends CommandLineFunction {
+    @Input(doc="file to gunzip") var inp = in
+    @Output(doc="file to gunzip to") var outp = out
+
+    def commandLine = "gunzip -c %s > %s".format(inp.getAbsolutePath, outp.getAbsolutePath)
+  }
+
+  class BeagleRefinement(moreBeagleArgs: String = "") extends CommandLineFunction {
+    @Input(doc="The beagle input file") var beagleInput: File = _
+    var beagleOutputBase: String = _
+    var beagleMemoryGigs: Int = BEAGLE_MEM_IN_GB
+
+    /**
+     * Note: These get set
+     */
+    @Output(doc="The beagle phased file") var beaglePhasedFile: File = _
+    @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
+    @Output(doc="The beagle r2 file") var beagleRSquared: File = _
+    var beagleOutputDir: String = _
+
+    def freezeOutputs = {
+      if ( beagleInput.getParent == null ) {
+        beagleOutputDir = ""
+      } else {
+        beagleOutputDir = beagleInput.getParent
+      }
+      beaglePhasedFile = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".phased.gz")
+      beagleLikelihoods = new File(beagleOutputDir + beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
+      beagleRSquared = new File(beagleOutputDir + beagleOutputBase +"."+beagleInput.getName+".r2")
+    }
+
+    def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s %s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,moreBeagleArgs,beagleOutputBase)
+  }
+
+  def RefineGenotypes(inputVCF: File, outputVCF: File, useCalibrationCurve: Boolean, moreBeagleArgs: String = "") = {
+    var beagleInput = new ProduceBeagleInput with GATKArgs
+    if ( interval != null ) beagleInput.intervalsString = List(interval)
+    beagleInput.variantVCF = inputVCF
+    beagleInput.out = swapExt(outputVCF,".vcf",".beagle")
+    if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE
+
+    var refine = new BeagleRefinement(moreBeagleArgs)
+    refine.beagleInput = beagleInput.out
+    refine.beagleOutputBase = outputVCF.getName + ".bout"
+    refine.beagleMemoryGigs = BEAGLE_MEM_IN_GB
+    refine.memoryLimit = Some(BEAGLE_MEM_IN_GB)
+    refine.freezeOutputs
+
+    var unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl"))
+    var unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl"))
+//    unzipPhased.isIntermediate = true
+//    unzipProbs.isIntermediate = true
+
+    var vcfConvert = new BeagleOutputToVCF with GATKArgs
+    vcfConvert.variantVCF = inputVCF
+    vcfConvert.rodBind :+= new RodBind("beagleR2","BEAGLE",refine.beagleRSquared)
+    vcfConvert.rodBind :+= new RodBind("beaglePhased","BEAGLE",unzipPhased.outp)
+    vcfConvert.rodBind :+= new RodBind("beagleProbs","BEAGLE",unzipProbs.outp)
+    vcfConvert.out = outputVCF
+
+    for ( cmd: CommandLineFunction <- List(beagleInput, refine, unzipPhased, unzipProbs, vcfConvert) )
+      add(cmd)
+  }
+
+  class MyEval(@Input(doc="Evaluation file") eval: File) extends VariantEval with GATKArgs {
+    this.noST = true
+    this.evalModule :+= "GenotypeConcordance"
+    this.o = swapExt(eval, ".vcf", ".vcf.eval")
+    this.rodBind :+= RodBind("eval", "VCF", eval)
+    this.rodBind :+= RodBind("comp_hm3", "VCF", HM3_VCF)
+    this.rodBind :+= RodBind("comp_omni", "VCF", OMNI_VCF)
+    if ( EvalInterval != null ) {
+      Console.printf("EvalInterval " + EvalInterval)
+      this.intervalsString = List(EvalInterval)
+    }
+  }
+
+  def script = {
+    for ( vcf <- vcfsToBeagle ) {
+      if ( ! TEST ) add(new MyEval(vcf))
+//      for ( niter <- List(10) ) {
+      for ( useCalibrationCurve <- List(true, false) ) {
+//        for ( includeFilteredRecords <- List(true, false) ) {
+        for ( niter <- List(10, 25, 50) ) {
+          if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) {
+            val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve))
+            RefineGenotypes(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter))
+            add(new MyEval(refine_out))
+          }
+        }
+      }
+    }
+  }
+}