Much improved python analysis routines, as well as easier / more correct merging utility. Better R scripts, which now close recalibration data by the confidence of the quality score itself

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1081 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-24 01:12:35 +00:00
parent ef546868bf
commit caf5aef0f8
4 changed files with 63 additions and 45 deletions

View File

@ -63,7 +63,7 @@ if __name__ == "__main__":
jobid = None
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
output = spec.getMergedBase() + '.stdout'
output = spec.getMergedBase()
cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
#print cmd
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)

View File

@ -59,7 +59,7 @@ class MergeFilesSpec:
def filename(self):
if self.merged_filename_base <> None:
return self.merged_filename_base + '.' + self.group()
return self.merged_filename_base + '.'.join([self.group()])
else:
return self.group()

View File

@ -10,14 +10,17 @@ from itertools import *
import math
import operator
MAX_QUAL_SCORE = 50
def phredQScore( nMismatches, nBases ):
#print 'phredQScore', nMismatches, nBases
if nMismatches == 0:
return 40
return MAX_QUAL_SCORE
elif nBases == 0:
return 0
else:
return -10 * math.log10(float(nMismatches) / nBases)
return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE)
return r
def phredScore2ErrorProp(qual):
@ -30,16 +33,16 @@ def tryByInt(s):
except:
return s
expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',')
defaultValues = '0,**,0,0,0,0,0'.split(',')
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
defaultValues = '0,0,0,**,0,0,0'.split(',')
class RecalData(dict):
def __init__(self):
self.parse(expectedHeader, defaultValues)
def parse(self, header, data):
# rg,dn,Qrep,pos,NBases,MMismatches,Qemp
types = [str, str, int, tryByInt, int, int, int]
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
types = [str, tryByInt, int, str, int, int, int]
for head, expected, datum, type in zip(header, expectedHeader, data, types):
if head <> expected:
raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum))
@ -59,32 +62,41 @@ class RecalData(dict):
def readGroup(self): return self.rg
def dinuc(self): return self.dn
def qReported(self): return self.Qrep
def qEmpirical(self): return self.Qemp
def cycle(self): return self.pos
def nBases(self): return self.NBases
def nMismatches(self): return self.MMismatches
def nExpectedMismatches(self): return self.nBases() * phredScore2ErrorProp(self.qReported())
def getNBases(self): return self.nBases
def getNMismatches(self): return self.nMismatches
def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported())
def qEmpirical(self):
#if OPTIONS.raw:
return self.Qemp
#else:
# r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1)
# #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp
# return r
def combine(self, moreData):
# grab useful info
sumErrors = self.nExpectedMismatches()
for datum in moreData:
self.NBases += datum.nBases()
self.MMismatches += datum.nMismatches()
self.nBases += datum.getNBases()
self.nMismatches += datum.getNMismatches()
sumErrors += datum.nExpectedMismatches()
self.updateQemp()
self.Qrep = phredQScore(sumErrors, self.nBases())
self.Qrep = phredQScore(sumErrors, self.getNBases())
#print 'self.Qrep is now', self.Qrep
return self
def updateQemp(self):
newQemp = phredQScore( self.nMismatches(), self.nBases() )
newQemp = phredQScore( self.getNMismatches(), self.getNBases() )
#print 'Updating qEmp', self.Qemp, newQemp
self.Qemp = newQemp
return newQemp
def __str__(self):
return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches())
return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches())
def __repr__(self):
return self.__str__()
@ -157,7 +169,7 @@ def countQsOfMinQuality(thres, data):
return qDeclared, qDeclaredTrue
def medianQreported(jaffe, allBases):
i, ignore = medianByCounts(map( RecalData.nBases, jaffe ))
i, ignore = medianByCounts(map( RecalData.getNBases, jaffe ))
return jaffe[i].qReported()
def medianByCounts(counts):
@ -170,7 +182,7 @@ def medianByCounts(counts):
return i, counts[i]
def modeQreported(jaffe, allBases):
ordered = sorted(jaffe, key=RecalData.nBases, reverse=True )
ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True )
#print ordered
return ordered[0].qReported()
@ -202,21 +214,21 @@ def rmse(reportedList, empiricalList, counts):
diff = reported - empirical
inc = count * diff * diff
sum += inc
#print reported, empirical, sum, inc, count, diff
#print sum, math.sqrt(sum)
return math.sqrt(sum)
def stdevQReported(jaffe, allBases):
mean = averageQreported(jaffe, allBases)
return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.nBases, jaffe ), mean)
return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean)
def coeffOfVariationQreported(jaffe, allBases):
mean = averageQreported(jaffe, allBases)
stdev = stdevQReported(jaffe, allBases)
return stdev / mean
# o("variance_Qreported %2.2f" % varianceQreported(jaffe))
def rmseJaffe(jaffe):
return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.nBases, jaffe ) )
return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) )
def basicQualScoreStats(readGroup, data, output ):
def o(s):
@ -227,8 +239,8 @@ def basicQualScoreStats(readGroup, data, output ):
o("read_group %s" % rg)
#o("number_of_cycles %d" % 0)
#o("maximum_reported_quality_score %d" % 0)
o("number_of_bases %d" % allBases.nBases())
o("number_of_mismatching_bases %d" % allBases.nMismatches())
o("number_of_bases %d" % allBases.getNBases())
o("number_of_mismatching_bases %d" % allBases.getNMismatches())
o("lane_wide_Qreported %2.2f" % allBases.qReported())
o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical())
o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported()))
@ -241,21 +253,22 @@ def basicQualScoreStats(readGroup, data, output ):
o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases))
o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases))
o("RMSE(qReported,qEmpirical) %2.2f" % rmseJaffe(jaffe))
o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe))
for thres in [20, 25, 30]:
qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
o("number_of_q%d_bases %d" % (thres, qDeclared.nBases()))
o("percent_of_q%d_bases %2.2f" % (thres, 100 * qDeclared.nBases() / float(allBases.nBases())))
o("number_of_q%d_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.nBases()))
o("percent_of_q%d_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.nBases() / float(allBases.nBases())))
o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases()))
o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases())))
o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases()))
o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases())))
def qDiffByCycle(readGroup, allData, output):
#print '#### qDiffByCycle ####'
print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
diff = datum.qEmpirical() - datum.qReported()
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
def qDiffByDinuc(readGroup, allData, output):
print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
@ -263,7 +276,7 @@ def qDiffByDinuc(readGroup, allData, output):
for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
diff = datum.qEmpirical() - datum.qReported()
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
def qReportedVsqEmpiricalStream(readGroup, data):
for key, datum in groupRecalData(data, key=RecalData.qReported):
@ -273,7 +286,9 @@ def qReportedVsqEmpiricalStream(readGroup, data):
def qReportedVsqEmpirical(readGroup, allData, output):
print >> output, 'Qreported Qempirical nMismatches nBases'
for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases())
#if datum.qReported() > 35:
# print datum
print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
def analyzeRawData(rawDataFile):
for readGroup, data in rawDataByReadGroup(rawDataFile):
@ -303,6 +318,7 @@ def analyzeFiles(files):
#print 'analyzeFiles', files
Rscript = config.getOption('R', 'Rscript', 'input_file')
for file in files:
print 'Analyzing file', file
plotter = getPlotterForFile(file)
if plotter <> None:
cmd = ' '.join([Rscript, plotter, file])
@ -328,6 +344,9 @@ def main():
parser.add_option("", "--dry", dest="dry",
action='store_true', default=False,
help="If provided, nothing actually gets run, just a dry run")
#parser.add_option("-r", "--raw", dest="raw",
# action='store_true', default=False,
# help="If provided, analyze data w.r.t. the raw empirical quality scores # mismatches / # bases, as opposed to the Yates correction of +1 to each")
parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
action="append", type="string", default=[],
help="If provided, only the provided read groups will be analyzed")

View File

@ -12,9 +12,9 @@ if __name__ == "__main__":
parser.add_option("-A", "--args", dest="args",
type="string", default="",
help="arguments to GATK")
parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs",
parser.add_option("-m", "--mode", dest="RecalMode",
type="string", default="",
help="arguments to GATK")
help="Mode argument to provide to table recalibrator")
parser.add_option("-q", "--farm", dest="farmQueue",
type="string", default=None,
help="Farm queue to send processing jobs to")
@ -45,37 +45,36 @@ if __name__ == "__main__":
covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
covariateInitial = covariateRoot + '.init'
initDataFile = covariateInitial + '.raw_data.csv'
initDataFile = covariateInitial + '.recal_data.csv'
covariateRecal = covariateRoot + '.recal'
recalDataFile = covariateRecal + '.raw_data.csv'
recalDataFile = covariateRecal + '.recal_data.csv'
if not os.path.exists(OPTIONS.scratchDir):
os.mkdir(OPTIONS.scratchDir)
def covariateCmd(bam, outputDir, ignoreAdds):
def covariateCmd(bam, outputDir):
add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
if not ignoreAdds:
add += " " + OPTIONS.CovariateArgs
return config.gatkCmd('CountCovariates') + add
def recalibrateCmd(inputBAM, dataFile, outputBAM):
return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM)
return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False):
def runCovariateCmd(inputBAM, dataFile, dir, jobid):
if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
cmd = covariateCmd(inputBAM, dir, ignoreAdds)
cmd = covariateCmd(inputBAM, dir)
return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
#
# Actually do some work here
#
jobid = None
jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False)
if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True)
jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)