Much improved python analysis routines, as well as easier / more correct merging utility. Better R scripts, which now close recalibration data by the confidence of the quality score itself
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1081 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ef546868bf
commit
caf5aef0f8
|
|
@ -63,7 +63,7 @@ if __name__ == "__main__":
|
|||
|
||||
jobid = None
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
|
||||
output = spec.getMergedBase() + '.stdout'
|
||||
output = spec.getMergedBase()
|
||||
cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
|
||||
#print cmd
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ class MergeFilesSpec:
|
|||
|
||||
def filename(self):
|
||||
if self.merged_filename_base <> None:
|
||||
return self.merged_filename_base + '.' + self.group()
|
||||
return self.merged_filename_base + '.'.join([self.group()])
|
||||
else:
|
||||
return self.group()
|
||||
|
||||
|
|
|
|||
|
|
@ -10,14 +10,17 @@ from itertools import *
|
|||
import math
|
||||
import operator
|
||||
|
||||
MAX_QUAL_SCORE = 50
|
||||
|
||||
def phredQScore( nMismatches, nBases ):
|
||||
#print 'phredQScore', nMismatches, nBases
|
||||
if nMismatches == 0:
|
||||
return 40
|
||||
return MAX_QUAL_SCORE
|
||||
elif nBases == 0:
|
||||
return 0
|
||||
else:
|
||||
return -10 * math.log10(float(nMismatches) / nBases)
|
||||
return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE)
|
||||
return r
|
||||
|
||||
|
||||
def phredScore2ErrorProp(qual):
|
||||
|
|
@ -30,16 +33,16 @@ def tryByInt(s):
|
|||
except:
|
||||
return s
|
||||
|
||||
expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',')
|
||||
defaultValues = '0,**,0,0,0,0,0'.split(',')
|
||||
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
|
||||
defaultValues = '0,0,0,**,0,0,0'.split(',')
|
||||
class RecalData(dict):
|
||||
|
||||
def __init__(self):
|
||||
self.parse(expectedHeader, defaultValues)
|
||||
|
||||
def parse(self, header, data):
|
||||
# rg,dn,Qrep,pos,NBases,MMismatches,Qemp
|
||||
types = [str, str, int, tryByInt, int, int, int]
|
||||
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
|
||||
types = [str, tryByInt, int, str, int, int, int]
|
||||
for head, expected, datum, type in zip(header, expectedHeader, data, types):
|
||||
if head <> expected:
|
||||
raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum))
|
||||
|
|
@ -59,32 +62,41 @@ class RecalData(dict):
|
|||
def readGroup(self): return self.rg
|
||||
def dinuc(self): return self.dn
|
||||
def qReported(self): return self.Qrep
|
||||
def qEmpirical(self): return self.Qemp
|
||||
def cycle(self): return self.pos
|
||||
def nBases(self): return self.NBases
|
||||
def nMismatches(self): return self.MMismatches
|
||||
def nExpectedMismatches(self): return self.nBases() * phredScore2ErrorProp(self.qReported())
|
||||
def getNBases(self): return self.nBases
|
||||
def getNMismatches(self): return self.nMismatches
|
||||
def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported())
|
||||
|
||||
|
||||
def qEmpirical(self):
|
||||
#if OPTIONS.raw:
|
||||
return self.Qemp
|
||||
#else:
|
||||
# r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1)
|
||||
# #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp
|
||||
# return r
|
||||
|
||||
|
||||
def combine(self, moreData):
|
||||
# grab useful info
|
||||
sumErrors = self.nExpectedMismatches()
|
||||
for datum in moreData:
|
||||
self.NBases += datum.nBases()
|
||||
self.MMismatches += datum.nMismatches()
|
||||
self.nBases += datum.getNBases()
|
||||
self.nMismatches += datum.getNMismatches()
|
||||
sumErrors += datum.nExpectedMismatches()
|
||||
self.updateQemp()
|
||||
self.Qrep = phredQScore(sumErrors, self.nBases())
|
||||
self.Qrep = phredQScore(sumErrors, self.getNBases())
|
||||
#print 'self.Qrep is now', self.Qrep
|
||||
return self
|
||||
|
||||
def updateQemp(self):
|
||||
newQemp = phredQScore( self.nMismatches(), self.nBases() )
|
||||
newQemp = phredQScore( self.getNMismatches(), self.getNBases() )
|
||||
#print 'Updating qEmp', self.Qemp, newQemp
|
||||
self.Qemp = newQemp
|
||||
return newQemp
|
||||
|
||||
def __str__(self):
|
||||
return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches())
|
||||
return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches())
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
|
|
@ -157,7 +169,7 @@ def countQsOfMinQuality(thres, data):
|
|||
return qDeclared, qDeclaredTrue
|
||||
|
||||
def medianQreported(jaffe, allBases):
|
||||
i, ignore = medianByCounts(map( RecalData.nBases, jaffe ))
|
||||
i, ignore = medianByCounts(map( RecalData.getNBases, jaffe ))
|
||||
return jaffe[i].qReported()
|
||||
|
||||
def medianByCounts(counts):
|
||||
|
|
@ -170,7 +182,7 @@ def medianByCounts(counts):
|
|||
return i, counts[i]
|
||||
|
||||
def modeQreported(jaffe, allBases):
|
||||
ordered = sorted(jaffe, key=RecalData.nBases, reverse=True )
|
||||
ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True )
|
||||
#print ordered
|
||||
return ordered[0].qReported()
|
||||
|
||||
|
|
@ -202,21 +214,21 @@ def rmse(reportedList, empiricalList, counts):
|
|||
diff = reported - empirical
|
||||
inc = count * diff * diff
|
||||
sum += inc
|
||||
#print reported, empirical, sum, inc, count, diff
|
||||
#print sum, math.sqrt(sum)
|
||||
return math.sqrt(sum)
|
||||
|
||||
def stdevQReported(jaffe, allBases):
|
||||
mean = averageQreported(jaffe, allBases)
|
||||
return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.nBases, jaffe ), mean)
|
||||
return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean)
|
||||
|
||||
def coeffOfVariationQreported(jaffe, allBases):
|
||||
mean = averageQreported(jaffe, allBases)
|
||||
stdev = stdevQReported(jaffe, allBases)
|
||||
return stdev / mean
|
||||
|
||||
# o("variance_Qreported %2.2f" % varianceQreported(jaffe))
|
||||
|
||||
def rmseJaffe(jaffe):
|
||||
return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.nBases, jaffe ) )
|
||||
return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) )
|
||||
|
||||
def basicQualScoreStats(readGroup, data, output ):
|
||||
def o(s):
|
||||
|
|
@ -227,8 +239,8 @@ def basicQualScoreStats(readGroup, data, output ):
|
|||
o("read_group %s" % rg)
|
||||
#o("number_of_cycles %d" % 0)
|
||||
#o("maximum_reported_quality_score %d" % 0)
|
||||
o("number_of_bases %d" % allBases.nBases())
|
||||
o("number_of_mismatching_bases %d" % allBases.nMismatches())
|
||||
o("number_of_bases %d" % allBases.getNBases())
|
||||
o("number_of_mismatching_bases %d" % allBases.getNMismatches())
|
||||
o("lane_wide_Qreported %2.2f" % allBases.qReported())
|
||||
o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical())
|
||||
o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported()))
|
||||
|
|
@ -241,21 +253,22 @@ def basicQualScoreStats(readGroup, data, output ):
|
|||
o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases))
|
||||
o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases))
|
||||
|
||||
o("RMSE(qReported,qEmpirical) %2.2f" % rmseJaffe(jaffe))
|
||||
o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe))
|
||||
for thres in [20, 25, 30]:
|
||||
qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
|
||||
o("number_of_q%d_bases %d" % (thres, qDeclared.nBases()))
|
||||
o("percent_of_q%d_bases %2.2f" % (thres, 100 * qDeclared.nBases() / float(allBases.nBases())))
|
||||
o("number_of_q%d_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.nBases()))
|
||||
o("percent_of_q%d_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.nBases() / float(allBases.nBases())))
|
||||
o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases()))
|
||||
o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases())))
|
||||
o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases()))
|
||||
o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases())))
|
||||
|
||||
def qDiffByCycle(readGroup, allData, output):
|
||||
#print '#### qDiffByCycle ####'
|
||||
print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
|
||||
print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
|
||||
for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
|
||||
datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
|
||||
diff = datum.qEmpirical() - datum.qReported()
|
||||
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
|
||||
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
|
||||
|
||||
def qDiffByDinuc(readGroup, allData, output):
|
||||
print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
|
||||
|
|
@ -263,7 +276,7 @@ def qDiffByDinuc(readGroup, allData, output):
|
|||
for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
|
||||
datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
|
||||
diff = datum.qEmpirical() - datum.qReported()
|
||||
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
|
||||
print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
|
||||
|
||||
def qReportedVsqEmpiricalStream(readGroup, data):
|
||||
for key, datum in groupRecalData(data, key=RecalData.qReported):
|
||||
|
|
@ -273,7 +286,9 @@ def qReportedVsqEmpiricalStream(readGroup, data):
|
|||
def qReportedVsqEmpirical(readGroup, allData, output):
|
||||
print >> output, 'Qreported Qempirical nMismatches nBases'
|
||||
for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
|
||||
print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases())
|
||||
#if datum.qReported() > 35:
|
||||
# print datum
|
||||
print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
|
||||
|
||||
def analyzeRawData(rawDataFile):
|
||||
for readGroup, data in rawDataByReadGroup(rawDataFile):
|
||||
|
|
@ -303,6 +318,7 @@ def analyzeFiles(files):
|
|||
#print 'analyzeFiles', files
|
||||
Rscript = config.getOption('R', 'Rscript', 'input_file')
|
||||
for file in files:
|
||||
print 'Analyzing file', file
|
||||
plotter = getPlotterForFile(file)
|
||||
if plotter <> None:
|
||||
cmd = ' '.join([Rscript, plotter, file])
|
||||
|
|
@ -328,6 +344,9 @@ def main():
|
|||
parser.add_option("", "--dry", dest="dry",
|
||||
action='store_true', default=False,
|
||||
help="If provided, nothing actually gets run, just a dry run")
|
||||
#parser.add_option("-r", "--raw", dest="raw",
|
||||
# action='store_true', default=False,
|
||||
# help="If provided, analyze data w.r.t. the raw empirical qulaity scores # mmismatches / # bases, as opposed to the Yates correction of +1 to each")
|
||||
parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
|
||||
action="append", type="string", default=[],
|
||||
help="If provided, only the provided read groups will be analyzed")
|
||||
|
|
|
|||
|
|
@ -12,9 +12,9 @@ if __name__ == "__main__":
|
|||
parser.add_option("-A", "--args", dest="args",
|
||||
type="string", default="",
|
||||
help="arguments to GATK")
|
||||
parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs",
|
||||
parser.add_option("-m", "--mode", dest="RecalMode",
|
||||
type="string", default="",
|
||||
help="arguments to GATK")
|
||||
help="Mode argument to provide to table recalibrator")
|
||||
parser.add_option("-q", "--farm", dest="farmQueue",
|
||||
type="string", default=None,
|
||||
help="Farm queue to send processing jobs to")
|
||||
|
|
@ -45,37 +45,36 @@ if __name__ == "__main__":
|
|||
|
||||
covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
|
||||
covariateInitial = covariateRoot + '.init'
|
||||
initDataFile = covariateInitial + '.raw_data.csv'
|
||||
initDataFile = covariateInitial + '.recal_data.csv'
|
||||
covariateRecal = covariateRoot + '.recal'
|
||||
recalDataFile = covariateRecal + '.raw_data.csv'
|
||||
recalDataFile = covariateRecal + '.recal_data.csv'
|
||||
|
||||
if not os.path.exists(OPTIONS.scratchDir):
|
||||
os.mkdir(OPTIONS.scratchDir)
|
||||
|
||||
def covariateCmd(bam, outputDir, ignoreAdds):
|
||||
def covariateCmd(bam, outputDir):
|
||||
add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
|
||||
if not ignoreAdds:
|
||||
add += " " + OPTIONS.CovariateArgs
|
||||
return config.gatkCmd('CountCovariates') + add
|
||||
|
||||
def recalibrateCmd(inputBAM, dataFile, outputBAM):
|
||||
return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM)
|
||||
return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
|
||||
|
||||
def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False):
|
||||
def runCovariateCmd(inputBAM, dataFile, dir, jobid):
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
|
||||
cmd = covariateCmd(inputBAM, dir, ignoreAdds)
|
||||
cmd = covariateCmd(inputBAM, dir)
|
||||
return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
#
|
||||
# Actually do some work here
|
||||
#
|
||||
jobid = None
|
||||
jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False)
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
|
||||
jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
|
||||
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
|
||||
cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True)
|
||||
jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)
|
||||
|
||||
Loading…
Reference in New Issue