First commit of recalibration master control script for recalibrating quality scores.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@885 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fa93661133
commit
b2eb724456
|
|
@ -0,0 +1,120 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Hg18 ref and rods
|
||||
reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out"
|
||||
|
||||
## 1KG ref and rods
|
||||
reference = "/broad/1KG/reference/human_b36_both.fasta"
|
||||
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out"
|
||||
|
||||
# Flags - many of these are not necessary now but were used for debugging
|
||||
training_range_str = "" #" -L chr1"
|
||||
gatk_flags = ""
|
||||
downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01"
|
||||
read_group = "" #" --READ_GROUP SRR005814"
|
||||
min_mapping_quality = " --MIN_MAPPING_QUALITY 1"
|
||||
recal_limit = 0
|
||||
|
||||
javaexe = "java -ea"
|
||||
|
||||
justPrintCommands = False
|
||||
|
||||
import os, sys
|
||||
sys.path.append(".")
|
||||
if len(sys.argv) > 1:
|
||||
#from recal_args import *
|
||||
#print "Using args imported from recal_args.py"
|
||||
#else:
|
||||
#from default_broad_recal_args import *
|
||||
#print "Using default Broad recalibration arguments"
|
||||
file = sys.argv[1]
|
||||
fileroot = file
|
||||
if file.endswith(".bam"):
|
||||
fileroot = fileroot.replace(".bam", "")
|
||||
fileroot = os.path.basename(fileroot)
|
||||
print "fileroot: "+fileroot
|
||||
#training_range_str = " -L chr1:1-20000000"
|
||||
|
||||
plot = False
|
||||
recal_only = False
|
||||
for arg in sys.argv:
|
||||
if arg == "-plot":
|
||||
plot = True
|
||||
if arg == "-recal_only":
|
||||
recal_only = True
|
||||
|
||||
def cmd(cmdstr):
|
||||
print ">>> Running: "+cmdstr
|
||||
if (not justPrintCommands):
|
||||
ret_val = str(os.system(cmdstr))
|
||||
print "<<< Returned: "+str(ret_val)
|
||||
|
||||
if int(ret_val) != 0:
|
||||
sys.exit("Last command failed: "+cmdstr)
|
||||
|
||||
# Assure that BAM index is available
|
||||
def assert_bam_index(bam_filename):
|
||||
if not os.path.exists(bam_filename+".bai"):
|
||||
cmd("samtools index "+bam_filename)
|
||||
|
||||
def ensure_dir(dir):
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
|
||||
# Process arguments
|
||||
if recal_limit == 0:
|
||||
recal_limit_str = ""
|
||||
else:
|
||||
recal_limit_str = "_"+str(recal_limit)
|
||||
|
||||
# This was the central directory where I put the files, I've changed it to "."
|
||||
# for simplicity now even though it probably needs to be read group aware
|
||||
#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work"
|
||||
covar_path = "."
|
||||
central_fileroot= covar_path+"/"+fileroot
|
||||
|
||||
bam = central_fileroot+".bam"
|
||||
if not os.path.lexists(bam):
|
||||
os.symlink(file, bam)
|
||||
|
||||
correction_str = ".corrected"+recal_limit_str
|
||||
corrected_fileroot = central_fileroot+correction_str
|
||||
corrected_bam = corrected_fileroot+".bam"
|
||||
|
||||
assert_bam_index(bam)
|
||||
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality)
|
||||
|
||||
# Make plots
|
||||
if plot:
|
||||
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv")
|
||||
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot)
|
||||
|
||||
#cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot)
|
||||
|
||||
if recal_only: # Stop if we are only making recalibration tables
|
||||
sys.exit()
|
||||
|
||||
cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags)
|
||||
cmd("samtools index "+corrected_bam)
|
||||
|
||||
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality)
|
||||
|
||||
# Make plots
|
||||
if plot:
|
||||
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv")
|
||||
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot)
|
||||
|
||||
# Merge corrected and uncorrected files
|
||||
output_dir = "comparison_png"
|
||||
ensure_dir(output_dir)
|
||||
corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f]
|
||||
print "\n".join(corrected_pngs)
|
||||
|
||||
for corrected_png in corrected_pngs:
|
||||
raw_png = corrected_png.replace(correction_str, "")
|
||||
both_png = corrected_png.replace(correction_str, "raw_and_corrected")
|
||||
cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png]))
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue