gatk-3.8/python/recal_qual.py

121 lines
4.3 KiB
Python
Executable File

#!/usr/bin/env python
# Hg18 ref and rods
reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out"
## 1KG ref and rods
reference = "/broad/1KG/reference/human_b36_both.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out"
# Flags - many of these are not necessary now but were used for debugging
training_range_str = "" #" -L chr1"
gatk_flags = ""
downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01"
read_group = "" #" --READ_GROUP SRR005814"
min_mapping_quality = " --MIN_MAPPING_QUALITY 1"
recal_limit = 0
javaexe = "java -ea"
justPrintCommands = False
import os, sys
sys.path.append(".")
if len(sys.argv) > 1:
#from recal_args import *
#print "Using args imported from recal_args.py"
#else:
#from default_broad_recal_args import *
#print "Using default Broad recalibration arguments"
file = sys.argv[1]
fileroot = file
if file.endswith(".bam"):
fileroot = fileroot.replace(".bam", "")
fileroot = os.path.basename(fileroot)
print "fileroot: "+fileroot
#training_range_str = " -L chr1:1-20000000"
plot = False
recal_only = False
for arg in sys.argv:
if arg == "-plot":
plot = True
if arg == "-recal_only":
recal_only = True
def cmd(cmdstr):
print ">>> Running: "+cmdstr
if (not justPrintCommands):
ret_val = str(os.system(cmdstr))
print "<<< Returned: "+str(ret_val)
if int(ret_val) != 0:
sys.exit("Last command failed: "+cmdstr)
# Assure that BAM index is available
def assert_bam_index(bam_filename):
if not os.path.exists(bam_filename+".bai"):
cmd("samtools index "+bam_filename)
def ensure_dir(dir):
if not os.path.exists(dir):
os.makedirs(dir)
# Process arguments
if recal_limit == 0:
recal_limit_str = ""
else:
recal_limit_str = "_"+str(recal_limit)
# This was the central directory where I put the files, I've changed it to "."
# for simplicity now even though it probably needs to be read group aware
#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work"
covar_path = "."
central_fileroot= covar_path+"/"+fileroot
bam = central_fileroot+".bam"
if not os.path.lexists(bam):
os.symlink(file, bam)
correction_str = ".corrected"+recal_limit_str
corrected_fileroot = central_fileroot+correction_str
corrected_bam = corrected_fileroot+".bam"
assert_bam_index(bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot)
cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot)
if recal_only: # Stop if we are only making recalibration tables
sys.exit()
cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags)
cmd("samtools index "+corrected_bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot)
# Merge corrected and uncorrected files
output_dir = "comparison_png"
ensure_dir(output_dir)
corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f]
print "\n".join(corrected_pngs)
for corrected_png in corrected_pngs:
raw_png = corrected_png.replace(correction_str, "")
both_png = corrected_png.replace(correction_str, "raw_and_corrected")
cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png]))