diff --git a/python/recal_qual.py b/python/recal_qual.py new file mode 100755 index 000000000..056719b0e --- /dev/null +++ b/python/recal_qual.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Hg18 ref and rods +reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out" + +## 1KG ref and rods +reference = "/broad/1KG/reference/human_b36_both.fasta" +dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out" + +# Flags - many of these are not necessary now but were used for debugging +training_range_str = "" #" -L chr1" +gatk_flags = "" +downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01" +read_group = "" #" --READ_GROUP SRR005814" +min_mapping_quality = " --MIN_MAPPING_QUALITY 1" +recal_limit = 0 + +javaexe = "java -ea" + +justPrintCommands = False + +import os, sys +sys.path.append(".") +if len(sys.argv) > 1: + #from recal_args import * + #print "Using args imported from recal_args.py" +#else: + #from default_broad_recal_args import * + #print "Using default Broad recalibration arguments" + file = sys.argv[1] + fileroot = file + if file.endswith(".bam"): + fileroot = fileroot.replace(".bam", "") + fileroot = os.path.basename(fileroot) + print "fileroot: "+fileroot + #training_range_str = " -L chr1:1-20000000" + +plot = False +recal_only = False +for arg in sys.argv: + if arg == "-plot": + plot = True + if arg == "-recal_only": + recal_only = True + +def cmd(cmdstr): + print ">>> Running: "+cmdstr + if (not justPrintCommands): + ret_val = str(os.system(cmdstr)) + print "<<< Returned: "+str(ret_val) + + if int(ret_val) != 0: + sys.exit("Last command failed: "+cmdstr) + +# Assure that BAM index is available +def assert_bam_index(bam_filename): + if not os.path.exists(bam_filename+".bai"): + cmd("samtools index "+bam_filename) + +def ensure_dir(dir): + if not os.path.exists(dir): + os.makedirs(dir) + +# Process arguments +if recal_limit == 0: + recal_limit_str = "" +else: + recal_limit_str = "_"+str(recal_limit) + +# This was the central directory where I put the files, I've changed it to "." +# for simplicity now even though it probably needs to be read group aware +#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work" +covar_path = "." +central_fileroot= covar_path+"/"+fileroot + +bam = central_fileroot+".bam" +if not os.path.lexists(bam): + os.symlink(file, bam) + +correction_str = ".corrected"+recal_limit_str +corrected_fileroot = central_fileroot+correction_str +corrected_bam = corrected_fileroot+".bam" + +assert_bam_index(bam) +cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality) + +# Make plots +if plot: + cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv") + cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot) + +#cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot) + +if recal_only: # Stop if we are only making recalibration tables + sys.exit() + +cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags) +cmd("samtools index "+corrected_bam) + +cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality) + +# Make plots +if plot: + cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv") + cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot) + +# Merge corrected and uncorrected files +output_dir = "comparison_png" +ensure_dir(output_dir) +corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f] +print "\n".join(corrected_pngs) + +for corrected_png in corrected_pngs: + raw_png = corrected_png.replace(correction_str, "") + both_png = corrected_png.replace(correction_str, "raw_and_corrected") + cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png])) + + +