Many generalization improvements - parameters, files as options - to script that runs pieces for predicting SNP calling performance for given SNP calling coverage

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1619 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-09-14 23:37:53 +00:00
parent 7eb21e55c1
commit e06e31d99f
1 changed files with 111 additions and 69 deletions

View File

@ -1,4 +1,4 @@
#!/usr/bin/python #!/usr/bin/env python
import sys, os import sys, os
from farm_commands import cmd from farm_commands import cmd
@ -18,74 +18,116 @@ class OptionParser (OptionParser):
def exists_and_non_zero(file): def exists_and_non_zero(file):
return os.path.exists(file) and os.path.getsize(file) != 0 return os.path.exists(file) and os.path.getsize(file) != 0
# Global files def process_bam(bam_file):
gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar " gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar "
ref = " -R /broad/1KG/reference/human_b36_both.fasta" output_basepath = os.path.splitext(bam_file)[0]
output_basefile = os.path.basename(output_basepath)
parser = OptionParser() print "Input BAM file:",bam_file
parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None)
(options, args) = parser.parse_args() using_hapmapchip = None
parser.check_required(("-b",)) if False:
output_base = os.path.splitext(options.bam_file)[0] # Config for chr1 of pilot 2 people
print "Input BAM file:",options.bam_file ref = " -R /broad/1KG/reference/human_b36_both.fasta"
#interval_string = "1"
# Assure that hapmap chip file is here # Assure existance of intervals file
hapmapchip_file = output_base+".hapmapchip.gff" intervals_file = output_basepath+".hapmap.intervals.chr1"
if not exists_and_non_zero(hapmapchip_file): if not exists_and_non_zero(intervals_file):
Cmd = "awk -F' ' '{ if ($1 == \""+interval_string+"\") {print $1 \":\" $4} }' "+hapmapchip_file+" > "+intervals_file
cmd(Cmd, die_on_fail = True)
# Setup hapmapchip GFFs; assure that hapmap chip file is here
using_hapmapchip = True
hapmapchip_file = output_basepath+".hapmapchip.gff"
if not exists_and_non_zero(hapmapchip_file):
sys.exit("Expected to find "+hapmapchip_file+" but it's not around") sys.exit("Expected to find "+hapmapchip_file+" but it's not around")
# Run SSG calls else:
ssg_outfile = output_base+".SSG.calls" # Config for pilot 3
if not exists_and_non_zero(ssg_outfile): ref = " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
intervals_file = "/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list"
# Assure that BAM index exists
bam_index_file = output_basepath+".bam.bai"
if not exists_and_non_zero(bam_index_file):
Cmd = "samtools index "+bam_file
cmd(Cmd, die_on_fail = True)
# Run SSG calls
ssg_outfile = output_basepath+".SSG.calls"
if not exists_and_non_zero(ssg_outfile):
print "Running SingleSampleGenotyper" print "Running SingleSampleGenotyper"
Cmd = gatk_exe + ref + " -T SingleSampleGenotyper"+\ Cmd = gatk_exe + ref + " -T SingleSampleGenotyper"+\
" -I "+options.bam_file+\ " -I "+bam_file+\
" -varout "+ssg_outfile+\ " -varout "+ssg_outfile+\
" -L 1"+\ " -L "+intervals_file+\
" -fmq0 -l INFO " " -fmq0 -l INFO "
cmd(Cmd) #" --genotype -lod 5"
# Run VariantEval on SSG calls #+interval_string+\
varianteval_outdir = "VariantEval" cmd(Cmd, die_on_fail=True)
varianteval_outbase = "VariantEval/"+output_base
varianteval_outfile = "VariantEval/"+output_base+".all.genotype_concordance"
if not exists_and_non_zero(varianteval_outfile): # Run VariantEval on SSG calls
print "Running VarianteEval with output to "+varianteval_outbase varianteval_outdir = output_basepath+".VariantEval"
varianteval_outbase = output_basepath+".VariantEval/"+output_basepath
varianteval_outfile = output_basepath+".VariantEval/"+output_basepath #+".all.genotype_concordance"
if not exists_and_non_zero(varianteval_outfile):
print "Running VariantEval with output to "+varianteval_outbase
if not os.path.exists(varianteval_outdir): if not os.path.exists(varianteval_outdir):
os.mkdir(varianteval_outdir) os.mkdir(varianteval_outdir)
Cmd = gatk_exe + ref + "-T VariantEval"+\ Cmd = gatk_exe + ref + " -T VariantEval"+\
" -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\ " -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\
" -B eval,Variants,"+ssg_outfile+\ " -B eval,Variants,"+ssg_outfile+\
" -hc "+hapmapchip_file+\
" -o "+varianteval_outbase+\ " -o "+varianteval_outbase+\
" -L /humgen/gsa-scr1/andrewk/coverage/b36.hapmap_intervals.chr1"+\ " -L "+intervals_file+\
" -evalContainsGenotypes"+\ " --evalContainsGenotypes"+\
" -minDiscoveryQ 5"+\ " -minConfidenceScore 5"+\
" -fmq0 -l INFO" " -fmq0 -l INFO"
cmd(Cmd) if using_hapmapchip:
Cmd = Cmd + " -hc "+hapmapchip_file
# Run CoverageHist on BAM file cmd(Cmd, die_on_fail=True)
hist_outfile = output_base+".hapmap.coverage_hist"
if not exists_and_non_zero(hist_outfile): # Run CoverageHist on BAM file
print "Running CoverageHist on "+options.bam_file hist_outfile = output_basepath+".hapmap.coverage_hist"
Cmd = gatk_exe + ref + "-T CoverageHistogram"+\ if not exists_and_non_zero(hist_outfile):
" -I "+options.bam_file+\ print "Running CoverageHist on "+bam_file
Cmd = gatk_exe + ref + " -T CoverageHistogram"+\
" -I "+bam_file+\
" -o "+hist_outfile+\ " -o "+hist_outfile+\
" -L /humgen/gsa-scr1/andrewk/coverage/b36.hapmap_intervals.chr1"+\ " -L "+intervals_file+\
" -fmq0 -l INFO" " -fmq0 -l INFO"
#" -hc ~/coverage/NA12878.b36.gff"+\
#" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\
cmd(Cmd)#,"gsa", output_base) cmd(Cmd, die_on_fail=True)#,"gsa", output_basepath)
# Now show CoverageEval predictions
Cmd = "CoverageEval.py -e"+\
" -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/NA12878.CLEANED.chr1.full.hapmap.stats"+\
" -g "+hist_outfile
cmd(Cmd)
# Now do CoverageEval predictions if they haven't been done
oneline_stats_file = output_basepath+".oneline_stats"
if not exists_and_non_zero(oneline_stats_file):
Cmd = "CoverageEval.py -e"+\
" -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/current/NA12878.chr1.full.stats"+\
" -g "+hist_outfile+\
" -v "+varianteval_outfile+\
" -o "+oneline_stats_file
cmd(Cmd, die_on_fail=True)
if __name__ == "__main__":
parser = OptionParser()
parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None)
parser.add_option("-a", "--all_bams", help="Process all BAMs in current directory", default=False, dest="all_bams", action="store_true")
(options, args) = parser.parse_args()
if not options.all_bams:
parser.check_required(("-b",))
process_bam(options.bam_file)
else:
bams = [f for f in os.listdir(".") if f.endswith(".bam")]
print bams
for bam in bams:
process_bam(bam)
#print