From eb4b9a743ae4ea120d78378d96ca575b759a09ae Mon Sep 17 00:00:00 2001 From: andrewk Date: Fri, 31 Jul 2009 00:18:45 +0000 Subject: [PATCH] Script that runs most of the steps involved in validating the CoverageEval system that predicts performance for given depth of sequencing coverage across a genome. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1353 348d0f76-0448-11de-a6fe-93d51630548a --- python/CoverageMeta.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100755 python/CoverageMeta.py diff --git a/python/CoverageMeta.py b/python/CoverageMeta.py new file mode 100755 index 000000000..1371f6238 --- /dev/null +++ b/python/CoverageMeta.py @@ -0,0 +1,98 @@ +#!/usr/bin/python + +import sys, os +from farm_commands import cmd + +#if __name__ == "__main__": +from optparse import OptionParser +class OptionParser (OptionParser): + + def check_required (self, opts): + for opt in opts: + option = self.get_option(opt) + + # Assumes the option's 'default' is set to None! + if getattr(self.values, option.dest) is None: + self.error("%s option not supplied" % option) + +def exists_and_non_zero(file): + return os.path.exists(file) and os.path.getsize(file) != 0 + +# Global files +gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar " +ref = " -R /broad/1KG/reference/human_b36_both.fasta" + +parser = OptionParser() +parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None) + +(options, args) = parser.parse_args() +parser.check_required(("-b",)) +output_base = os.path.splitext(options.bam_file)[0] +print "Input BAM file:",options.bam_file + +# Assure that hapmap chip file is here +hapmapchip_file = output_base+".hapmapchip.gff" +if not exists_and_non_zero(hapmapchip_file): + sys.exit("Expected to find "+hapmapchip_file+" but it's not around") + +# Run SSG calls +ssg_outfile = output_base+".SSG.calls" +if not exists_and_non_zero(ssg_outfile): + print "Running SingleSampleGenotyper" + Cmd = gatk_exe + ref + " -T SingleSampleGenotyper"+\ + " -I "+options.bam_file+\ + " -varout "+ssg_outfile+\ + " -L 1"+\ + " -fmq0 -l INFO " + cmd(Cmd) + +# Run VariantEval on SSG calls +varianteval_outdir = "VariantEval" +varianteval_outbase = "VariantEval/"+output_base +varianteval_outfile = "VariantEval/"+output_base+".all.genotype_concordance" + +if not exists_and_non_zero(varianteval_outfile): + print "Running VarianteEval with output to "+varianteval_outbase + if not os.path.exists(varianteval_outdir): + os.mkdir(varianteval_outdir) + Cmd = gatk_exe + ref + "-T VariantEval"+\ + " -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\ + " -B eval,Variants,"+ssg_outfile+\ + " -hc "+hapmapchip_file+\ + " -o "+varianteval_outbase+\ + " -L /humgen/gsa-scr1/andrewk/coverage/b36.hapmap_intervals.chr1"+\ + " -evalContainsGenotypes"+\ + " -minDiscoveryQ 5"+\ + " -fmq0 -l INFO" + cmd(Cmd) + +# Run CoverageHist on BAM file +hist_outfile = output_base+".hapmap.coverage_hist" +if not exists_and_non_zero(hist_outfile): + print "Running CoverageHist on "+options.bam_file + Cmd = gatk_exe + ref + "-T CoverageHistogram"+\ + " -I "+options.bam_file+\ + " -o "+hist_outfile+\ + " -L /humgen/gsa-scr1/andrewk/coverage/b36.hapmap_intervals.chr1"+\ + " -fmq0 -l INFO" + #" -hc ~/coverage/NA12878.b36.gff"+\ + #" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\ + + cmd(Cmd)#,"gsa", output_base) + +# Now show CoverageEval predictions +Cmd = "CoverageEval.py -e"+\ + " -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/NA12878.CLEANED.chr1.full.hapmap.stats"+\ + " -g "+hist_outfile +cmd(Cmd) + + + + + + + + + + + \ No newline at end of file