From bae0b6cba8b30bef6784914c6a28ec18a5dc2ffb Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 27 Mar 2011 12:44:25 +0000 Subject: [PATCH] A script for playing with BEAGLE refinement parameters. Supports construction of reference panels from NGS data sets with varying niteration and calibration curve parameters, as well as imputing missing genotypes in a VCF with this reference panel, and comparison to a deeply sequenced individual. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5523 348d0f76-0448-11de-a6fe-93d51630548a --- .../depristo/RefineGenotypesWithBeagle.q | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q index e9bed79d3..73e57b7f0 100755 --- a/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q +++ b/scala/qscript/oneoffs/depristo/RefineGenotypesWithBeagle.q @@ -16,18 +16,21 @@ class RefineGenotypesWithBeagle extends QScript { @Argument(doc="Reference file",required=true,shortName="R") var reference: File = _ @Argument(doc="Beagle interval",required=false,shortName="L") var interval: String = null @Argument(doc="Evaluation interval",required=false,shortName="Le") var EvalInterval: String = null - @Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 6 + @Argument(doc="Memory in GB for beagle",required=false,shortName="BM") var BEAGLE_MEM_IN_GB: Int = 12 @Argument(doc="X",required=false,shortName="cc") var CALIBRATION_CURVE: File = new File("vqsr.calibration.curve") @Argument(doc="X",required=false,shortName="test") var TEST: Boolean = false + @Argument(doc="If provided, we'll skip over creating the reference panels, even if apparently required",required=false,shortName="assumeReferencePanelsExist") var assumeReferencePanelsExist: Boolean = false @Argument(doc="Tmpdir",required=false,shortName="tmpdir") var TMPDIR: File = new File("./") // assessing imputation performance + @Argument(doc="VCF sites and alleles for genotyping assessment",required=false,shortName="assessmentSites") var assessmentSites: File = null @Argument(doc="BAM File for genotyping",required=false, shortName="bam") var bam: File = null @Argument(doc="Percent of sites that should be left out of BAM VCF to assess imputation", required=false, shortName="flo") var fractionsLeftOut: List[Double] = List(0.1, 0.2, 0.5, 0.9) // todo -- this might be best to think about in a different unit -- marker density per bp + val MISSING_KEY = "?" val HM3_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf") val OMNI_VCF: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf") @@ -116,14 +119,11 @@ class RefineGenotypesWithBeagle extends QScript { beagleInput.out = swapExt(outputVCF,".vcf",".beagle") if ( useCalibrationCurve ) beagleInput.cc = CALIBRATION_CURVE beagleInput.markers = swapExt(outputVCF, ".vcf", ".markers.txt") - beagleInput.isIntermediate = true val refine = new RefineGenotypesWithBeagle(beagleInput.out, moreBeagleArgs) val unzipPhased = new GunzipFile(refine.beaglePhasedFile,swapExt(refine.beaglePhasedFile,".gz",".bgl")) val unzipProbs = new GunzipFile(refine.beagleLikelihoods,swapExt(refine.beagleLikelihoods,".gz",".bgl")) - //unzipPhased.isIntermediate = true - //unzipProbs.isIntermediate = true val vcfConvert = new BeagleOutputToVCF with GATKArgs vcfConvert.variantVCF = inputVCF @@ -200,27 +200,29 @@ class RefineGenotypesWithBeagle extends QScript { } def script = { - for ( vcf <- vcfsToBeagle ) { - var bamGenotypes: File = null - if ( bam != null ) { - bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + vcf.getName).getName) - add(new GenotypeBAMAtSites(bam, vcf, bamGenotypes)) - } + var bamGenotypes: File = null + if ( bam != null ) { + bamGenotypes = new File(swapExt(bam, ".bam", "_genotyped_at." + assessmentSites.getName).getName) + add(new GenotypeBAMAtSites(bam, assessmentSites, bamGenotypes)) + } + + for ( vcf <- vcfsToBeagle ) { if ( ! TEST ) add(new EvalPanelAtChipSites(vcf)) for ( useCalibrationCurve <- List(true, false) ) { for ( niter <- List(10, 20, 50) ) { if ( ! TEST || (niter == 10 && ! useCalibrationCurve)) { - val refine_out = swapExt(vcf, ".vcf", ".refined.niter_%d_cc_%b.vcf".format(niter, useCalibrationCurve)) + val refineFilenamePart = "niter_%d_cc_%b".format(niter, useCalibrationCurve) + val refine_out = swapExt(vcf, ".vcf", ".refined.%s.vcf".format(refineFilenamePart)) val refPanel = new ReferencePanelBuilder(vcf, refine_out, useCalibrationCurve, "niterations=%d".format(niter)) - refPanel.enqueueCommands() + if ( ! assumeReferencePanelsExist ) refPanel.enqueueCommands() // start up VE add(new EvalPanelAtChipSites(refine_out)) if ( bamGenotypes != null ) { - for ( fractionLeftOut <- fractionsLeftOut ) { - val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed.vcf".format(fractionLeftOut)) + for ( fractionLeftOut <- if ( TEST ) List(0.1) else fractionsLeftOut ) { + val bamGenotypesImputed = swapExt(bamGenotypes, ".vcf", "_flo_%.2f.imputed_with_%s".format(fractionLeftOut, refine_out)) val args = "niterations=%d missing=\"%s\"".format(niter, MISSING_KEY) val panelEval = new EvaluateReferencePanel(bamGenotypes, bamGenotypesImputed, refPanel, fractionLeftOut, args) panelEval.enqueueCommands()