I've pulled out the functionality of the analyzer into a single python file which doesn't require all of the irrelevant config parameters (which would cause problems for external users). I'll release this and the simple config file to 1KG for use in analyzing recalibration efforts.

Please note that this is literally my first foray into the wonderful world of python. There may well be a more elegant way of releasing the script to external users without having to duplicate the file; if so, please feel free to restructure it that way in a second release — but, for now, this needs to be online by tomorrow morning.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1404 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-08-11 02:56:43 +00:00
parent dd228880ed
commit 0e7c158949
2 changed files with 459 additions and 0 deletions

View File

@ -0,0 +1,449 @@
from __future__ import with_statement
import os.path
import sys
from optparse import OptionParser
import re
from itertools import *
import math
import operator
import ConfigParser
# Phred-scaled quality scores are capped at this value throughout the script.
MAX_QUAL_SCORE = 40
# option name -> expected type for options every config file must define
# (empty by default so external users need only a minimal config)
defaultRequiredOptions = {}
class gatkConfigParser(ConfigParser.SafeConfigParser):
GATK = 'DEFAULT'
def __init__(self, configFiles):
ConfigParser.SafeConfigParser.__init__(self)
files = filter(None, configFiles)
print 'Reading configuration file(s):', files
self.read(files)
self.validateRequiredOptions()
def validateRequiredOptions(self):
for key, value in defaultRequiredOptions.iteritems():
self.validateOption(self.GATK, key, value)
def validateOption(self, section, name, type = str):
v = self.getOption(section, name, type)
#print ' => Validated option', name, v
def getGATKOption(self, name, type = str):
return self.getOption(self.GATK, name, type)
def getGATKModeOption(self, name, mode, type = str):
return self.getOption(mode, name, type)
def getOption(self, section, name, typeF = None):
if not self.has_option(section, name):
raise "Option %s not found in section %s" % (name, section)
else:
val = self.get(section, name)
if typeF == 'input_file' or typeF == 'output_file':
path = os.path.abspath(os.path.expanduser(val))
if typeF == 'input_file':
if not os.path.exists(path):
raise "Input file does not exist", path
if not os.access(path, os.R_OK):
raise "Input file cannot be read", path
if typeF == 'output_file':
if not os.access(path, os.W_OK):
raise "Output file cannot be written", path
return path
elif type(typeF) == str:
return str(val)
elif typeF == None:
return val
else:
return typeF(val)
def phredQScore( nMismatches, nBases ):
    """Calculates a phred-scaled score for nMismatches in nBases.

    Returns MAX_QUAL_SCORE when there are no mismatches, 0 when there is no
    data, and otherwise -10*log10(error rate) capped at MAX_QUAL_SCORE.
    (BUG FIX: removed an unreachable 'return r' that referenced an
    undefined name.)"""
    if nMismatches == 0:
        return MAX_QUAL_SCORE
    elif nBases == 0:
        return 0
    else:
        return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE)
def phredScore2ErrorProp(qual):
    """Converts a phred-scaled quality score to an error probability"""
    # Q -> 10^(-Q/10), e.g. Q10 -> 0.1, Q20 -> 0.01
    return 10.0 ** (-float(qual) / 10.0)
def tryByInt(s):
    """Return int(s) if s parses as an integer, otherwise s unchanged."""
    try:
        return int(s)
    except (ValueError, TypeError):
        # BUG FIX: narrowed from a bare except, which would also have
        # swallowed KeyboardInterrupt and genuine programming errors
        return s
# Column names expected in the raw recalibration CSV, and the default values
# used to initialize an empty RecalData record ('**' means any dinucleotide).
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
defaultValues = '0,0,0,**,0,0,0'.split(',')
class RecalData(dict):
    """Basic recalibration data -- corresponds exactly to the Java version in GATK.

    A dict keyed by the raw CSV column names (rg, pos, Qrep, dn, nBases,
    nMismatches, Qemp) with attribute-style read access via __getattr__."""
    def __init__(self):
        # initialize every field to its default value
        self.parse(expectedHeader, defaultValues)

    def parse(self, header, data):
        """Parse the comma-separated data line with corresponding header.  Raises an
        exception if the header doesn't correspond to the expectedHeader."""
        # rg,pos,Qrep,dn,NBases,MMismatches,Qemp
        types = [str, tryByInt, int, str, int, int, int]
        for head, expected, datum, cast in zip(header, expectedHeader, data, types):
            if head != expected:
                # BUG FIX: raising a bare string is a TypeError in Python >= 2.6
                raise ValueError("Unexpected header in rawData %s %s %s" % (head, expected, datum))
            self[head] = cast(datum)
        return self

    def set(self, header, values):
        # overwrite fields pairwise; no type coercion is applied here
        for head, val in zip(header, values):
            self[head] = val

    def __getattr__(self, name):
        # expose dict entries as attributes (raises KeyError for unknown names)
        return self[name]

    #
    # Trivial accessor functions
    #
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def cycle(self): return self.pos
    def getNBases(self): return self.nBases
    def getNMismatches(self): return self.nMismatches

    def nExpectedMismatches(self):
        # expected mismatch count implied by the reported quality
        return self.getNBases() * phredScore2ErrorProp(self.qReported())

    def qEmpirical(self):
        #if OPTIONS.raw:
        return self.Qemp
        # Yates-corrected alternative, currently disabled:
        # r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1)
        # return r

    def combine(self, moreData):
        """Fold the RecalData records in moreData into self; mutates and returns self.

        Qrep becomes the phred score of the summed expected error counts
        (i.e. the nBases-weighted expected Q), and Qemp is recomputed from
        the summed observed mismatches."""
        # NOTE(review): the augmented assignments below store instance
        # attributes that shadow the dict entries of the same name; the
        # accessors still see the updated values -- confirm before relying
        # on the dict view of a combined record.
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
            self.nBases += datum.getNBases()
            self.nMismatches += datum.getNMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
        self.Qrep = phredQScore(sumErrors, self.getNBases())
        return self

    def updateQemp(self):
        """Recompute and store the empirical quality from the current counts."""
        newQemp = phredQScore( self.getNMismatches(), self.getNBases() )
        self.Qemp = newQemp
        return newQemp

    def __str__(self):
        return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches())

    def __repr__(self):
        return self.__str__()
def rawDataStream(file):
    """Yields RecalData records parsed from the CSV lines of the data file.

    Comment lines (containing '#') are skipped; the header line (containing
    'rg,') establishes the column order used for all subsequent records."""
    header = None
    # BUG FIX: use a context manager so the file handle is closed even if
    # the consumer abandons the generator early
    with open(file) as handle:
        for line in handle:
            if line.find("#") != -1:
                continue
            data = line.strip().split(',')
            if line.find("rg,") != -1:
                header = data
            else:
                yield RecalData().parse(header, data)
def rawDataByReadGroup(rawDataFile):
    """Yields (readGroup, [RecalData, ...]) pairs, one per read group in the file."""
    stream = rawDataStream(rawDataFile)
    for readGroup, members in groupby(stream, key=RecalData.readGroup):
        yield readGroup, list(members)
def combineRecalData(separateData):
    """Collapse a list of RecalData records into one combined RecalData."""
    combined = RecalData()
    return combined.combine(separateData)
def groupRecalData(allData, key=None):
    """Group allData by key, combine each group into a single RecalData, and
    return the [groupKey, combined] pairs sorted by group key."""
    grouped = []
    for groupKey, members in groupby(sorted(allData, key=key), key=key):
        grouped.append([groupKey, combineRecalData(members)])
    grouped.sort(key=lambda pair: pair[0])
    return grouped
#
# let's actually analyze the data!
#
def analyzeReadGroup(readGroup, data, outputRoot):
print 'Read group => ', readGroup
print 'Number of elements => ', len(data)
files = []
if OPTIONS.toStdout:
basicQualScoreStats(readGroup, data, sys.stdout )
qReportedVsqEmpirical(readGroup, data, sys.stdout )
qDiffByCycle(readGroup, data, sys.stdout)
qDiffByDinuc(readGroup, data, sys.stdout)
else:
def outputFile(tail):
file = outputRoot + tail
files.append(file)
return file
with open(outputFile(".basic_info.dat"), 'w') as output:
basicQualScoreStats(readGroup, data, output )
with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output:
qReportedVsqEmpirical(readGroup, data, output )
with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output:
qDiffByCycle(readGroup, data, output)
with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output:
qDiffByDinuc(readGroup, data, output)
print 'Files', files
return analyzeFiles(files)
def countQsOfMinQuality(thres, data):
    """Returns RecalData lists for each of the following:
    All quality score bins with qRep > thres, and all quality scores with qRep and qRemp > thres"""
    declared = [d for d in data if d.qReported() > thres]
    declaredTrue = [d for d in declared if d.qEmpirical() > thres]
    qDeclared = RecalData().combine(declared)
    qDeclaredTrue = RecalData().combine(declaredTrue)
    return qDeclared, qDeclaredTrue
def medianQreported(jaffe, allBases):
    """qReported of the quality bin holding the weighted median base (allBases unused)."""
    index, count = medianByCounts([datum.getNBases() for datum in jaffe])
    return jaffe[index].qReported()
def medianByCounts(counts):
    """Locate the bin containing the weighted median.

    counts[i] is the number of observations in bin i.  Returns
    (index, counts[index]) for the first bin whose cumulative share of all
    observations exceeds 0.5, or None for an empty list.  (Idiom fixes:
    builtin sum instead of lsum, enumerate instead of range(len()), and no
    more shadowing of the builtin 'sum'.)"""
    nTotal = sum(counts)
    running = 0.0
    for i, count in enumerate(counts):
        running += count
        if running / nTotal > 0.5:
            # The current bin contains the median
            return i, count
def modeQreported(jaffe, allBases):
    """qReported of the quality bin containing the most bases (allBases unused)."""
    # max() returns the first maximal element, matching the stable
    # reverse-sorted-take-first behavior
    biggest = max(jaffe, key=RecalData.getNBases)
    return biggest.qReported()
def averageQreported(jaffe, allBases):
    """Weighted mean reported quality (jaffe unused).

    allBases is the combination of every bin, and combine() already stores
    the nBases-weighted expected Q in Qrep, so no recomputation is needed."""
    return allBases.qReported()
def lsum(inlist):
    """Sum of the elements of inlist (0 for an empty list)."""
    # builtin sum is equivalent to reduce(operator.__add__, inlist, 0)
    return sum(inlist)
def lsamplestdev (inlist, counts, mean):
    """Weighted sample standard deviation of the values in inlist.

    counts[i] is the number of observations of inlist[i]; mean is the
    precomputed weighted mean.  Uses the n-1 (Bessel-corrected) denominator.
    (DOC FIX: the old docstring claimed this returned the variance with an
    N denominator, which did not match the implementation.)"""
    n = sum(counts)  # builtin sum replaces lsum; no longer shadows 'sum'
    ss = 0.0
    for item, count in zip(inlist, counts):
        diff = item - mean
        ss += count * diff * diff
    return math.sqrt(ss / float(n - 1))
def rmse(reportedList, empiricalList, counts):
    """Root of the count-weighted sum of squared (reported - empirical) differences.

    NOTE(review): despite the name, the result is not normalized by the total
    count, so this is a root-sum-of-squares rather than a true RMSE -- kept
    as-is because the reported statistics depend on this exact value."""
    total = 0.0
    for reported, empirical, count in zip(reportedList, empiricalList, counts):
        diff = reported - empirical
        total += count * diff * diff
    return math.sqrt(total)
def stdevQReported(jaffe, allBases):
    """Weighted sample stdev of reported quality across quality bins."""
    mean = averageQreported(jaffe, allBases)
    quals = [datum.qReported() for datum in jaffe]
    weights = [datum.getNBases() for datum in jaffe]
    return lsamplestdev(quals, weights, mean)
def coeffOfVariationQreported(jaffe, allBases):
    """Coefficient of variation (stdev / mean) of the reported quality."""
    mean = averageQreported(jaffe, allBases)
    return stdevQReported(jaffe, allBases) / mean
def rmseJaffe(jaffe):
    """RMSE-style distance between reported and empirical quality, weighted by nBases."""
    reported = [d.qReported() for d in jaffe]
    empirical = [d.qEmpirical() for d in jaffe]
    weights = [d.getNBases() for d in jaffe]
    return rmse(reported, empirical, weights)
def basicQualScoreStats(readGroup, data, output ):
    """Write whole-lane summary statistics for one read group's data to output."""
    def o(s):
        # every output line goes through this helper
        print >> output, s
    # aggregate all the data into a single datum
    rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0]
    #o(allBases)
    o("read_group %s" % rg)
    #o("number_of_cycles %d" % 0)
    #o("maximum_reported_quality_score %d" % 0)
    o("number_of_bases %d" % allBases.getNBases())
    o("number_of_mismatching_bases %d" % allBases.getNMismatches())
    o("lane_wide_Qreported %2.2f" % allBases.qReported())
    o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical())
    o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported()))
    # one combined RecalData per reported-quality bin
    jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)]
    o("median_Qreported %2.2f" % medianQreported(jaffe, allBases))
    o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases))
    o("average_Qreported %2.2f" % averageQreported(jaffe, allBases))
    o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases))
    o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases))
    o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe))
    # count/fraction of bases declared above common quality thresholds,
    # and how many of those bins are empirically above the threshold too
    for thres in [20, 25, 30]:
        qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
        o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases()))
        o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases())))
        o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases()))
        o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases())))
def qDiffByCycle(readGroup, allData, output):
    """Write per-machine-cycle quality statistics as a whitespace table to output."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for cycle, binned in groupRecalData(allData, key=RecalData.cycle):
        binned.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        qRep, qEmp = binned.qReported(), binned.qEmpirical()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (binned.cycle(), qRep, qEmp, qEmp - qRep, binned.getNMismatches(), binned.getNBases())
def qDiffByDinuc(readGroup, allData, output):
    """Write per-dinucleotide quality statistics as a whitespace table to output."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for dinuc, binned in groupRecalData(allData, key=RecalData.dinuc):
        binned.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        qRep, qEmp = binned.qReported(), binned.qEmpirical()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (binned.dinuc(), qRep, qEmp, qEmp - qRep, binned.getNMismatches(), binned.getNBases())
def qReportedVsqEmpiricalStream(readGroup, data):
    """Yields (Qreported bin, combined RecalData) pairs, one per reported quality."""
    for qRep, combined in groupRecalData(data, key=RecalData.qReported):
        combined.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', qRep, '*'])
        yield qRep, combined
def qReportedVsqEmpirical(readGroup, allData, output):
    """Write the empirical-vs-reported quality table to output."""
    print >> output, 'Qreported Qempirical nMismatches nBases PercentBases'
    rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0]
    # hoist the lane-wide base count out of the loop
    totalBases = float(allBases.getNBases())
    for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
        percent = 100.0 * datum.getNBases() / totalBases
        print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), percent)
def analyzeRawData(rawDataFile):
    """Drive the per-read-group analysis of a raw/recal data CSV file,
    honoring the read-group selection and maxReadGroups options."""
    nReadGroups = 0
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        # skip groups excluded by -g (an empty selection means "all")
        if OPTIONS.selectedReadGroups != [] and readGroup not in OPTIONS.selectedReadGroups:
            continue
        nReadGroups += 1
        if OPTIONS.maxReadGroups != -1 and nReadGroups > OPTIONS.maxReadGroups:
            break
        root, sourceFilename = os.path.split(rawDataFile)
        if OPTIONS.outputDir:
            root = OPTIONS.outputDir
        outputRoot = os.path.join(root, "%s.%s.%s" % (sourceFilename, readGroup, 'analysis'))
        analyzeReadGroup(readGroup, data, outputRoot)
# Maps a filename regex to its handler: either a callable invoked directly on
# the matching file, or the name of an option in the [R] config section whose
# value is the R script used to plot the file (see getPlotterForFile).
plottersByFile = {
    "raw_data.csv$" : analyzeRawData,
    "recal_data.csv$" : analyzeRawData,
    "empirical_v_reported_quality" : 'PlotQEmpStated',
    "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc',
    "quality_difference_v_cycle" : 'PlotQDiffByCycle' }
def getPlotterForFile(file):
    """Return the R plotting script path for file (looked up via the config),
    or run the file's python handler in place and return None."""
    for pattern, handler in plottersByFile.iteritems():
        if not re.search(pattern, file):
            continue
        if isinstance(handler, str):
            return config.getOption('R', handler, 'input_file')
        handler(file)
    return None
def analyzeFiles(files):
#print 'analyzeFiles', files
Rscript = config.getOption('R', 'Rscript', 'input_file')
for file in files:
print 'Analyzing file', file
plotter = getPlotterForFile(file)
if plotter <> None and not OPTIONS.noplots:
cmd = ' '.join([Rscript, plotter, file])
status = os.system(cmd)
def main():
    """Entry point: parse command-line options, load the GATK configuration,
    and run the analysis over the files named on the command line."""
    # config and OPTIONS are read at module scope by the analysis functions
    global config, OPTIONS
    usage = """usage: %prog -c config.cfg files*"""
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dir", dest="outputDir",
                      type="string", default=None,
                      help="If provided, analysis output files will be written to this directory")
    parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups",
                      type="int", default=-1,
                      help="Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-s", "--stdout", dest="toStdout",
                      action='store_true', default=False,
                      help="If provided, writes output to standard output, not to files")
    parser.add_option("", "--no_plots", dest="noplots",
                      action='store_true', default=False,
                      help="If provided, no plots will be generated")
    parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
                      action="append", type="string", default=[],
                      help="If provided, only the provided read groups will be analyzed")
    (OPTIONS, args) = parser.parse_args()
    # at least one -c config file is mandatory
    if len(OPTIONS.configs) == 0:
        parser.error("Requires at least one configuration file be provided")
    config = gatkConfigParser(OPTIONS.configs)
    if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups
    analyzeFiles(args)
import unittest
class TestanalzyeRecalQuals(unittest.TestCase):
    """Self-tests for the list-statistics helpers (lsum, lsamplestdev).

    NOTE(review): the class name contains the typo 'analzye'; kept as-is to
    avoid breaking any external references to the test name."""
    def setUp(self):
        # the same multiset expressed three ways: flat, as distinct items,
        # and as per-item counts
        self.numbers = [0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6]
        self.numbersItems = [0, 1, 2, 3, 4, 5, 6]
        self.numbersCounts = [1, 1, 2, 1, 3, 3, 2]
        self.numbers_sum = 47
        self.numbers_mean = 3.615385
        self.numbers_mode = 4
        self.numbers_median = 4
        self.numbers_stdev = 1.894662
        self.numbers_var = 3.589744
        self.numbers_cov = self.numbers_stdev / self.numbers_mean

    def testSum(self):
        # FIX: assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(self.numbers_sum, lsum(self.numbers))
        self.assertEqual(0, lsum(self.numbers[0:0]))
        self.assertEqual(1, lsum(self.numbers[0:2]))
        self.assertEqual(3, lsum(self.numbers[0:3]))

    def teststdev(self):
        self.assertAlmostEqual(self.numbers_stdev, lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean), 4)
# Run the analysis by default; switch to unittest.main() to run the
# embedded self-tests instead.
if __name__ == '__main__':
    main()
    #unittest.main()

10
testdata/recalConfig_1KG.cfg vendored 100644
View File

@ -0,0 +1,10 @@
[DEFAULT]
resources: resources
#dbsnp: %(resources)s/dbsnp_129_hg18.rod
dbsnp: %(resources)s/dbsnp_129_b36.rod
[R]
Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript
PlotQEmpStated: %(resources)s/plot_q_emp_stated_hst.R
PlotQDiffByCycle: %(resources)s/plot_qual_diff_v_cycle.R
PlotQDiffByDinuc: %(resources)s/plot_qual_diff_v_dinuc.R