Approach v2. Added a Python analysis script, so Java no longer needs to be used to analyze quality score data. About to refactor out lots of unneeded code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1063 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-20 16:00:23 +00:00
parent dde52e33eb
commit 9e26550b0d
10 changed files with 429 additions and 158 deletions

View File

@ -0,0 +1,26 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots recalibration quality data: a scatter of reported vs. empirical
# quality scores, and a histogram of base counts per empirical quality bin.
# Usage: <script> <table-file>; writes two PDFs next to the input file.
args <- commandArgs(TRUE)
input <- args[1]
qualTable <- read.table(input, header=TRUE)

# Scatter plot: reported quality (x) vs. empirical quality (y), with the
# ideal y = x line for reference.
outfile <- paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
plot(qualTable$Qreported, qualTable$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
abline(0, 1)
dev.off()

# Histogram of base counts per empirical quality score, skipping empty bins.
outfile <- paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
nonEmpty <- subset(data.frame(t.Qempirical=qualTable$Qempirical, t.nBases=qualTable$nBases), t.nBases != 0)
plot(nonEmpty$t.Qempirical, nonEmpty$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
# Force plain (non-scientific) tick labels on the y axis.
axis(2, axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,16 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots (empirical - reported) quality score as a function of machine cycle.
# Usage: <script> <table-file>; writes a PDF next to the input file.
# NOTE(review): this diff hunk claims 16 lines but fewer are shown; the
# closing dev.off() is presumably below the visible region.
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#X11(width=7, height=14)
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# Input is the .quality_difference_v_cycle.dat table written by the
# Python analysis script (qDiffByCycle).
c <- read.table(input, header=T)
plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10))

View File

@ -0,0 +1,16 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots (empirical - reported) quality score per dinucleotide context.
# Usage: <script> <table-file>; writes a PDF next to the input file.
# NOTE(review): this diff hunk claims 16 lines but fewer are shown; the
# closing dev.off() is presumably below the visible region.
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="")
#d <- read.csv(input)
# Input is the .quality_difference_v_dinucleotide.dat table written by the
# Python analysis script (qDiffByDinuc).
d <- read.table(input, header=T)
plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10))

View File

@ -22,20 +22,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
public int buggyMaxReadLen = 100000;
//@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false)
//public int MAX_QUAL_SCORE = 63;
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
public String OUTPUT_FILEROOT = "output";
//@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
//public boolean CREATE_TRAINING_DATA = true;
//@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
//public float DOWNSAMPLE_FRACTION=1.0f;
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
public int MIN_MAPPING_QUALITY = 0;
public int MIN_MAPPING_QUALITY = 1;
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
public String READ_GROUP = "none";
@ -47,9 +38,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public List<String> platforms = Collections.singletonList("*");
//public List<String> platforms = Collections.singletonList("ILLUMINA");
//@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
//public boolean outputRawData = true;
@Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations")
public boolean writeDetailedAnalysisFiles = false;
@ -59,10 +47,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
//RecalData[][][] data;
long counted_sites = 0; // number of sites used to count covariates
long counted_bases = 0; // number of bases used to count covariates
@ -161,21 +146,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
public void onTraversalDone(Integer result) {
/*
PrintStream covars_out;
try {
covars_out = new PrintStream(OUTPUT_FILEROOT+".covars.out");
covars_out.println(RecalData.headerString());
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum : getRecalData(readGroup.getReadGroupId()) ) {
covars_out.println(datum);
}
}
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException: " + e.getMessage());
}
*/
printInfo(out);
writeTrainingData();
writeAnalysisData();
@ -194,7 +164,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
private void writeTrainingData() {
out.printf("Writing raw recalibration data%n");
writeRawTable();
writeRecalTable();
out.printf("...done%n");
out.printf("Writing logistic recalibration data%n");
@ -228,10 +198,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
}
private void writeRawTable() {
private void writeRecalTable() {
PrintStream table_out = null;
try {
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv");
printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {

View File

@ -229,7 +229,7 @@ class CombinatorialRecalMapping implements RecalMapping {
byte[][] dataTable = index == -1 ? null : cache.get(index);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base));
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;

View File

@ -69,5 +69,5 @@ if __name__ == "__main__":
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, '', waitID = jobid, just_print_commands = OPTIONS.dry)
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -0,0 +1,253 @@
from __future__ import with_statement
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import re
from itertools import *
import math
def phredQScore( nMismatches, nBases ):
    """Empirical Phred quality score: -10 * log10(mismatch rate).

    nMismatches may be a float (expected-mismatch sums from
    RecalData.combine()).  Returns 0 for an empty bin (nBases == 0) and
    caps mismatch-free bins at Q40, since log10(0) is undefined.
    """
    if nBases == 0:
        # No observations at all.  Checked first so the 0/0 case yields 0;
        # previously it fell into the nMismatches == 0 branch and returned 40.
        return 0
    elif nMismatches == 0:
        return 40
    else:
        return -10 * math.log10(float(nMismatches) / nBases)
def phredScore2ErrorProp(qual):
    """Convert a Phred quality score into its error probability, 10^(-Q/10)."""
    return 10.0 ** (float(qual) / -10.0)
# Expected column order of the raw/recal CSV written by CountCovariates.
expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',')
# Values used to initialize an empty RecalData record.
defaultValues = '0,**,0,0,0,0,0'.split(',')

class RecalData(dict):
    """One row of recalibration covariate data, keyed by CSV column name.

    Stored as a dict so rows can be parsed generically from the CSV
    header; __getattr__ exposes the columns as attributes.  NOTE(review):
    attribute *assignment* (as done in combine/updateQemp) writes to the
    instance __dict__, not the dict entry, so the two stores can diverge
    after combine(); the accessors below still see the updated values.
    """

    def __init__(self):
        # Start from the all-zero defaults; parse() overwrites with real data.
        self.parse(expectedHeader, defaultValues)

    def parse(self, header, data):
        """Bind one CSV row (rg,dn,Qrep,pos,NBases,MMismatches,Qemp) to self.

        Raises ValueError if the file's header does not match expectedHeader.
        (Previously raised a plain string, which is not a legal exception.)
        """
        types = [str, str, int, int, int, int, int]
        for head, expected, datum, conv in zip(header, expectedHeader, data, types):
            if head != expected:
                raise ValueError("Unexpected header in rawData %s %s %s" % (head, expected, datum))
            self[head] = conv(datum)
        return self

    def set(self, header, values):
        """Overwrite the named fields with the given values (no conversion)."""
        for head, val in zip(header, values):
            self[head] = val

    def __getattr__(self, name):
        # Expose dict entries as attributes; translate a miss into
        # AttributeError (not KeyError) so hasattr() and friends behave.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    # Named accessors for the CSV columns: rg,dn,Qrep,pos,NBases,MMismatches,Qemp
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def qEmpirical(self): return self.Qemp
    def cycle(self): return self.pos
    def nBases(self): return self.NBases
    def nMismatches(self): return self.MMismatches

    def nExpectedMismatches(self):
        # Mismatches we would expect at the reported quality, given nBases.
        return self.nBases() * phredScore2ErrorProp(self.qReported())

    def combine(self, moreData):
        """Fold the records in moreData into self and return self.

        Sums bases and mismatches, recomputes Qemp from the summed counts,
        and sets Qrep to the expected quality across all combined bins
        (weighted by nBases via the expected-mismatch sums).
        """
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
            self.NBases += datum.nBases()
            self.MMismatches += datum.nMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
        self.Qrep = phredQScore(sumErrors, self.nBases())
        return self

    def updateQemp(self):
        """Recompute and store the empirical quality from the current counts."""
        newQemp = phredQScore( self.nMismatches(), self.nBases() )
        self.Qemp = newQemp
        return newQemp

    def __str__(self):
        return "rg=%s cycle=%d dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d" % ( self.readGroup(), self.cycle(), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches())
def rawDataStream(file):
    """Yields successive RecalData records parsed from the CSV data file.

    Lines containing '#' are skipped as comments; the header line
    (containing 'rg,') is remembered and used to parse every subsequent
    data line.  (The removed-in-Python-3 '<>' operator is replaced with
    the equivalent '!='.)
    """
    header = None
    for line in open(file):
        if line.find("#") != -1: continue
        else:
            data = line.strip().split(',')
            if line.find("rg,") != -1:
                header = data
            else:
                yield RecalData().parse(header, data)
def rawDataByReadGroup(rawDataFile):
    """Yields (readGroup, records) pairs from rawDataFile, one per read group.

    Groups consecutive rows of the stream by their read group; records is a
    fully materialized list so callers may iterate it more than once.
    """
    grouped = groupby(rawDataStream(rawDataFile), key=RecalData.readGroup)
    for readGroup, records in grouped:
        yield (readGroup, list(records))
def combineRecalData(separateData):
    """Collapse an iterable of RecalData records into one combined record."""
    merged = RecalData()
    return merged.combine(separateData)
def groupRecalData(allData, key=None):
    """Group allData by key and collapse each group into one combined record.

    Returns a list of [groupKey, combinedRecalData] pairs, sorted by group
    key.  The data is sorted before grouping, as itertools.groupby requires.
    """
    ordered = sorted(allData, key=key)
    pairs = []
    for groupKey, members in groupby(ordered, key=key):
        pairs.append([groupKey, combineRecalData(members)])
    return sorted(pairs, key=lambda pair: pair[0])
#
# let's actually analyze the data!
#
def analyzeReadGroup(readGroup, data, outputRoot):
    """Summarize one read group's recalibration data and plot the results.

    Writes three .dat tables rooted at outputRoot (empirical vs. reported
    quality, quality difference by cycle, and by dinucleotide) and hands
    them to analyzeFiles() for R plotting.  With --stdout, the tables are
    printed to standard output instead and no files are produced.
    """
    print 'Read group => ', readGroup
    print 'Number of elements => ', len(data)
    files = []  # paths of the .dat files written below
    if OPTIONS.toStdout:
        # Debug mode: dump all three tables to stdout; nothing to plot.
        qReportedVsqEmpirical(readGroup, data, sys.stdout )
        qDiffByCycle(readGroup, data, sys.stdout)
        qDiffByDinuc(readGroup, data, sys.stdout)
    else:
        def outputFile(tail):
            # Build an output path from the root, remembering it for plotting.
            file = outputRoot + tail
            files.append(file)
            return file
        with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output:
            qReportedVsqEmpirical(readGroup, data, output )
        with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output:
            qDiffByCycle(readGroup, data, output)
        with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output:
            qDiffByDinuc(readGroup, data, output)
    print 'Files', files
    return analyzeFiles(files)
def qDiffByCycle(readGroup, allData, output):
    """Write a table of (empirical - reported) quality per machine cycle."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%d %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
#
# public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) {
# ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
# ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
# file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
# RecalData All = new RecalData(0,0,readGroup,"");
# MeanReportedQuality AllReported = new MeanReportedQuality();
# for (int c=0; c < RecalData.NDINUCS; c++) {
# ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c)));
# ByCycleReportedQ.add(new MeanReportedQuality());
# }
#
# for ( RecalData datum: getRecalData(readGroup) ) {
# int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
# ByCycle.get(dinucIndex).inc(datum.N, datum.B);
# ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
# All.inc(datum.N, datum.B);
# AllReported.inc(datum.qual, datum.N);
# }
#
# for (int c=0; c < RecalData.NDINUCS; c++) {
# double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
# double reportedQual = ByCycleReportedQ.get(c).result();
# file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
# }
# }
def qDiffByDinuc(readGroup, allData, output):
    """Write a table of (empirical - reported) quality per dinucleotide context."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
def qReportedVsqEmpirical(readGroup, allData, output):
    """Write a table comparing each reported-quality bin to its empirical quality."""
    print >> output, 'Qreported Qempirical nMismatches nBases'
    for key, datum in groupRecalData(allData, key=RecalData.qReported):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*'])
        print >> output, "%2d %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases())
def analyzeRawData(rawDataFile):
    """Analyze each read group found in rawDataFile, honoring --readGroup filters.

    Output files are named <source>.<readGroup>.analysis* and written next to
    the input file, or to --dir when given.
    """
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        # An empty selectedReadGroups list means "analyze everything".
        if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
            root, sourceFilename = os.path.split(rawDataFile)
            if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
            outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
            analyzeReadGroup(readGroup, data, outputRoot)
# Maps a filename regex to its analysis: either a callable run directly on
# the matching file, or the name of an [R] config option naming a plot script.
plottersByFile = {
    "raw_data.csv$" : analyzeRawData,
    "recal_data.csv$" : analyzeRawData,
    "empirical_v_reported_quality" : 'PlotQEmpStated',
    "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc',
    "quality_difference_v_cycle" : 'PlotQDiffByCycle' }

def getPlotterForFile(file):
    """Return the R plotting script configured for file, or None.

    For patterns mapped to a string, the script path is looked up in the
    [R] section of the config.  For patterns mapped to a callable, the
    callable is invoked on the file as a side effect and the scan
    continues, ultimately returning None (callables write their own
    output files).  Uses items() instead of the Python-3-removed
    iteritems(); behavior is identical.
    """
    for pat, analysis in plottersByFile.items():
        if re.search(pat, file):
            if isinstance(analysis, str):
                return config.getOption('R', analysis, 'input_file')
            else:
                analysis(file)
    return None
def analyzeFiles(files):
    """Dispatch each file to its analyzer; R plotters run via farm_commands.

    Files with no matching plotter (getPlotterForFile returns None) are
    skipped.  Uses 'is not None' in place of the Python-3-removed '<>'
    comparison; behavior is identical.
    """
    Rscript = config.getOption('R', 'Rscript', 'input_file')
    for file in files:
        plotter = getPlotterForFile(file)
        if plotter is not None:
            cmd = ' '.join([Rscript, plotter, file])
            farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
if __name__ == "__main__":
    # Command-line driver: parse options, load the GATK config file(s), and
    # analyze every data file listed on the command line.
    usage = """usage: %prog -c config.cfg files*"""
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="outputDir",
                      type="string", default=None,
                      help="If provided, analysis output files will be written to this directory")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-s", "--stdout", dest="toStdout",
                      action='store_true', default=False,
                      help="If provided, writes output to standard output, not to files")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
                      action="append", type="string", default=[],
                      help="If provided, only the provided read groups will be analyzed")
    # OPTIONS is read as a global by the analysis functions above.
    (OPTIONS, args) = parser.parse_args()
    #if len(args) != 3:
    #    parser.error("incorrect number of arguments")
    if len(OPTIONS.configs) == 0:
        parser.error("Requires at least one configuration file be provided")
    # config is read as a global by getPlotterForFile/analyzeFiles.
    config = gatkConfigParser(OPTIONS.configs)
    analyzeFiles(args)

View File

@ -0,0 +1,107 @@
#
#
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.bam --OUTPUT_FILEROOT test/NA07048_test --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.raw_data.csv
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta -T TableRecalibration -I NA07048.bam -params test/NA07048_test.raw_data.csv -outputBAM NA07048.test.bam -l INFO -compress 1 -L chr1:1-10,000,000
# samtools index NA07048.test.bam
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.test.bam --OUTPUT_FILEROOT test/NA07048_test.recal --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.recal.raw_data.csv
#
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import glob
if __name__ == "__main__":
    # Pipeline driver: count covariates on the input BAM, recalibrate it,
    # re-count covariates on the recalibrated BAM, and (with --plot) render
    # before/after quality plots with R.
    # NOTE(review): indentation was lost in the diff rendering; nesting below
    # is reconstructed from the control flow and should be confirmed against
    # the original file.
    usage = """usage: %prog config.cfg* input.bam output.bam"""
    parser = OptionParser(usage=usage)
    parser.add_option("-A", "--args", dest="args",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="scratchDir",
                      type="string", default="test",
                      help="Output directory")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("", "--plot", dest="plot",
                      action='store_true', default=False,
                      help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    (OPTIONS, args) = parser.parse_args()
    #if len(args) != 3:
    #    parser.error("incorrect number of arguments")
    # Positional args: one or more config files, then input.bam output.bam.
    configFiles = args[0:len(args)-2]
    config = gatkConfigParser(configFiles)
    inputBAM = args[len(args)-2]
    outputBAM = args[len(args)-1]
    # Derive scratch-file roots from the output BAM's basename:
    # <scratchDir>/<root>.init.* for pre-recalibration covariates,
    # <scratchDir>/<root>.recal.* for post-recalibration covariates.
    rootname = os.path.split(os.path.splitext(outputBAM)[0])[1]
    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'
    initDataFile = covariateInitial + '.raw_data.csv'
    covariateRecal = covariateRoot + '.recal'
    recalDataFile = covariateRecal + '.raw_data.csv'
    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)

    def covariateCmd(bam, outputDir, ignoreAdds):
        # Build the CountCovariates command line; ignoreAdds suppresses the
        # user's extra -C arguments (used for the post-recalibration pass).
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        if not ignoreAdds:
            add += " " + OPTIONS.CovariateArgs
        return config.gatkCmd('CountCovariates') + add

    def recalibrateCmd(inputBAM, dataFile, outputBAM):
        # Build the TableRecalibration command line from the covariate table.
        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM)

    def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False):
        # Submit CountCovariates to the farm unless its output already exists;
        # returns the new farm job id (or None if the step was skipped).
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
            cmd = covariateCmd(inputBAM, dir, ignoreAdds)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)

    if OPTIONS.plot:
        # Plot-only mode: run the R scripts over covariate files produced by
        # an earlier pipeline run; nothing is recomputed.
        def plotCmd(cmd):
            farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
        Rscript = config.getOption('R', 'Rscript', 'input_file')
        plotEmpStated = config.getOption('R', 'PlotQEmpStated', 'input_file')
        plotByCycleDinuc = config.getOption('R', 'PlotQByCycleDinuc', 'input_file')
        # Gather per-read-group quality files for both the initial and
        # recalibrated covariate passes.
        empQualFiles = map( lambda x: glob.glob(''.join([x,'.RG_*.empirical_v_reported_quality.csv'])), [covariateInitial, covariateRecal])
        empQualFiles = empQualFiles[0] + empQualFiles[1]
        readGroupRoots = map(lambda x: x.replace(".empirical_v_reported_quality.csv", ""), empQualFiles)
        print readGroupRoots
        for file in empQualFiles:
            plotCmd(' '.join([Rscript, plotEmpStated, file]))
        for root in readGroupRoots:
            plotCmd(' '.join([Rscript, plotByCycleDinuc, root]))
    else:
        # Pipeline mode: chain the farm jobs via waitID so each step waits
        # for its predecessor.
        jobid = None
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False)
        if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
            cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
            jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        # Re-count covariates on the recalibrated BAM for before/after plots.
        jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True)

View File

@ -1,120 +0,0 @@
#!/usr/bin/env python
# Hg18 ref and rods
reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out"
## 1KG ref and rods
reference = "/broad/1KG/reference/human_b36_both.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out"
# Flags - many of these are not necessary now but were used for debugging
training_range_str = "" #" -L chr1"
gatk_flags = ""
downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01"
read_group = "" #" --READ_GROUP SRR005814"
min_mapping_quality = " --MIN_MAPPING_QUALITY 1"
recal_limit = 0
javaexe = "java -ea"
justPrintCommands = False
import os, sys
sys.path.append(".")
if len(sys.argv) > 1:
#from recal_args import *
#print "Using args imported from recal_args.py"
#else:
#from default_broad_recal_args import *
#print "Using default Broad recalibration arguments"
file = sys.argv[1]
fileroot = file
if file.endswith(".bam"):
fileroot = fileroot.replace(".bam", "")
fileroot = os.path.basename(fileroot)
print "fileroot: "+fileroot
#training_range_str = " -L chr1:1-20000000"
plot = False
recal_only = False
for arg in sys.argv:
if arg == "-plot":
plot = True
if arg == "-recal_only":
recal_only = True
def cmd(cmdstr):
print ">>> Running: "+cmdstr
if (not justPrintCommands):
ret_val = str(os.system(cmdstr))
print "<<< Returned: "+str(ret_val)
if int(ret_val) != 0:
sys.exit("Last command failed: "+cmdstr)
# Assure that BAM index is available
def assert_bam_index(bam_filename):
if not os.path.exists(bam_filename+".bai"):
cmd("samtools index "+bam_filename)
def ensure_dir(dir):
if not os.path.exists(dir):
os.makedirs(dir)
# Process arguments
if recal_limit == 0:
recal_limit_str = ""
else:
recal_limit_str = "_"+str(recal_limit)
# This was the central directory where I put the files, I've changed it to "."
# for simplicity now even though it probably needs to be read group aware
#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work"
covar_path = "."
central_fileroot= covar_path+"/"+fileroot
bam = central_fileroot+".bam"
if not os.path.lexists(bam):
os.symlink(file, bam)
correction_str = ".corrected"+recal_limit_str
corrected_fileroot = central_fileroot+correction_str
corrected_bam = corrected_fileroot+".bam"
assert_bam_index(bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot)
cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot)
if recal_only: # Stop if we are only making recalibration tables
sys.exit()
cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags)
cmd("samtools index "+corrected_bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot)
# Merge corrected and uncorrected files
output_dir = "comparison_png"
ensure_dir(output_dir)
corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f]
print "\n".join(corrected_pngs)
for corrected_png in corrected_pngs:
raw_png = corrected_png.replace(correction_str, "")
both_png = corrected_png.replace(correction_str, "raw_and_corrected")
cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png]))

View File

@ -10,14 +10,17 @@ gatkData: /humgen/gsa-scr1/GATK_Data
dbsnp: %(gatkData)s/dbsnp_129_hg18.rod
tmp: /tmp/
args: -l INFO
Sting: ~/dev/GenomeAnalysisTK/trunk
#args: -l INFO -L chr1:1-1,000,000
[CountCovariates]
args: --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s
args: --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s
[TableRecalibration]
args: -compress 1
[R]
Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript
PlotQEmpStated: ~andrewk/recalQual_resources/plot_q_emp_stated_hst.R
PlotQEmpStated: %(Sting)s/R/plot_q_emp_stated_hst.R
PlotQDiffByCycle: %(Sting)s/R/plot_qual_diff_v_cycle.R
PlotQDiffByDinuc: %(Sting)s/R/plot_qual_diff_v_dinuc.R