121 lines
4.3 KiB
Python
Executable File
121 lines
4.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# Hg18 ref and rods
|
|
reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
|
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out"
|
|
|
|
## 1KG ref and rods
|
|
reference = "/broad/1KG/reference/human_b36_both.fasta"
|
|
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out"
|
|
|
|
# Flags - many of these are not necessary now but were used for debugging
|
|
training_range_str = "" #" -L chr1"
|
|
gatk_flags = ""
|
|
downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01"
|
|
read_group = "" #" --READ_GROUP SRR005814"
|
|
min_mapping_quality = " --MIN_MAPPING_QUALITY 1"
|
|
recal_limit = 0
|
|
|
|
javaexe = "java -ea"
|
|
|
|
justPrintCommands = False
|
|
|
|
import os, sys
|
|
sys.path.append(".")
|
|
if len(sys.argv) > 1:
|
|
#from recal_args import *
|
|
#print "Using args imported from recal_args.py"
|
|
#else:
|
|
#from default_broad_recal_args import *
|
|
#print "Using default Broad recalibration arguments"
|
|
file = sys.argv[1]
|
|
fileroot = file
|
|
if file.endswith(".bam"):
|
|
fileroot = fileroot.replace(".bam", "")
|
|
fileroot = os.path.basename(fileroot)
|
|
print "fileroot: "+fileroot
|
|
#training_range_str = " -L chr1:1-20000000"
|
|
|
|
plot = False
|
|
recal_only = False
|
|
for arg in sys.argv:
|
|
if arg == "-plot":
|
|
plot = True
|
|
if arg == "-recal_only":
|
|
recal_only = True
|
|
|
|
def cmd(cmdstr):
|
|
print ">>> Running: "+cmdstr
|
|
if (not justPrintCommands):
|
|
ret_val = str(os.system(cmdstr))
|
|
print "<<< Returned: "+str(ret_val)
|
|
|
|
if int(ret_val) != 0:
|
|
sys.exit("Last command failed: "+cmdstr)
|
|
|
|
# Assure that BAM index is available
|
|
def assert_bam_index(bam_filename):
|
|
if not os.path.exists(bam_filename+".bai"):
|
|
cmd("samtools index "+bam_filename)
|
|
|
|
def ensure_dir(dir):
|
|
if not os.path.exists(dir):
|
|
os.makedirs(dir)
|
|
|
|
# Process arguments
|
|
if recal_limit == 0:
|
|
recal_limit_str = ""
|
|
else:
|
|
recal_limit_str = "_"+str(recal_limit)
|
|
|
|
# This was the central directory where I put the files, I've changed it to "."
|
|
# for simplicity now even though it probably needs to be read group aware
|
|
#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work"
|
|
covar_path = "."
|
|
central_fileroot= covar_path+"/"+fileroot
|
|
|
|
bam = central_fileroot+".bam"
|
|
if not os.path.lexists(bam):
|
|
os.symlink(file, bam)
|
|
|
|
correction_str = ".corrected"+recal_limit_str
|
|
corrected_fileroot = central_fileroot+correction_str
|
|
corrected_bam = corrected_fileroot+".bam"
|
|
|
|
assert_bam_index(bam)
|
|
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality)
|
|
|
|
# Make plots
|
|
if plot:
|
|
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv")
|
|
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot)
|
|
|
|
cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot)
|
|
|
|
if recal_only: # Stop if we are only making recalibration tables
|
|
sys.exit()
|
|
|
|
cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags)
|
|
cmd("samtools index "+corrected_bam)
|
|
|
|
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality)
|
|
|
|
# Make plots
|
|
if plot:
|
|
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv")
|
|
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot)
|
|
|
|
# Merge corrected and uncorrected files
|
|
output_dir = "comparison_png"
|
|
ensure_dir(output_dir)
|
|
corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f]
|
|
print "\n".join(corrected_pngs)
|
|
|
|
for corrected_png in corrected_pngs:
|
|
raw_png = corrected_png.replace(correction_str, "")
|
|
both_png = corrected_png.replace(correction_str, "raw_and_corrected")
|
|
cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png]))
|
|
|
|
|
|
|