from __future__ import with_statement
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import re
from itertools import *
import math
import operator
# Highest quality score we will ever report; 0-mismatch bins are capped here.
MAX_QUAL_SCORE = 40

def phredQScore(nMismatches, nBases):
    """Calculates a phred-scaled quality score for nMismatches in nBases.

    Returns MAX_QUAL_SCORE when no mismatches were observed, 0 when there
    is no data at all, and min(-10*log10(mismatch rate), MAX_QUAL_SCORE)
    otherwise.
    """
    if nMismatches == 0:
        # No observed errors: cap at the maximum representable quality
        return MAX_QUAL_SCORE
    elif nBases == 0:
        # No data at all: lowest possible confidence
        return 0
    else:
        return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE)
def phredScore2ErrorProp(qual):
    """Converts a phred-scaled quality score to an error probability.

    E.g. Q10 -> 0.1, Q20 -> 0.01, Q30 -> 0.001.
    """
    return math.pow(10.0, float(qual) / -10.0)
def tryByInt(s):
    """Try to cast s to an int; return s unchanged if it is not int-like."""
    try:
        return int(s)
    except (ValueError, TypeError):
        # Not numeric (e.g. a '*' wildcard cycle) -- keep it as-is
        return s
# Column layout of the recalibration CSV emitted by the GATK, and the default
# values used to initialize an empty RecalData record.
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
defaultValues = '0,0,0,**,0,0,0'.split(',')
class RecalData(dict):
    """Basic recalibration data -- corresponds exactly to the Java version in GATK.

    A dict keyed by the expectedHeader column names (rg, pos, Qrep, dn,
    nBases, nMismatches, Qemp) with attribute-style read access.
    """
    def __init__(self):
        # Start from the all-zero / wildcard default record
        self.parse(expectedHeader, defaultValues)

    def parse(self, header, data):
        """Parse a comma-separated data line with its corresponding header.

        Raises ValueError if the header doesn't correspond to expectedHeader.
        """
        # rg,pos,Qrep,dn,nBases,nMismatches,Qemp
        casts = [str, tryByInt, int, str, int, int, int]
        for head, expected, datum, cast in zip(header, expectedHeader, data, casts):
            if head != expected:
                raise ValueError("Unexpected header in rawData %s %s %s" % (head, expected, datum))
            self[head] = cast(datum)
        return self

    def set(self, header, values):
        """Overwrite the given header fields with the given values."""
        for head, val in zip(header, values):
            self[head] = val

    def __getattr__(self, name):
        # Expose dict entries as attributes. NOTE(review): unknown names
        # raise KeyError rather than AttributeError.
        return self[name]

    #
    # Trivial accessor functions
    #
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def cycle(self): return self.pos
    def getNBases(self): return self.nBases
    def getNMismatches(self): return self.nMismatches

    def nExpectedMismatches(self):
        """Number of mismatches we'd expect given the reported quality."""
        return self.getNBases() * phredScore2ErrorProp(self.qReported())

    def qEmpirical(self):
        return self.Qemp

    def combine(self, moreData):
        """Fold the counts of every datum in moreData into self, then
        recompute Qemp and the (expected-error weighted) Qrep. Returns self.

        NOTE(review): the += assignments bind instance attributes that
        shadow the dict entries written by parse(); the accessors still see
        the updated values, but self['nBases'] etc. become stale.
        """
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
            self.nBases += datum.getNBases()
            self.nMismatches += datum.getNMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
        # The combined Qrep is the quality implied by the summed expected errors
        self.Qrep = phredQScore(sumErrors, self.getNBases())
        return self

    def updateQemp(self):
        """Recompute the empirical quality from the current counts."""
        newQemp = phredQScore(self.getNMismatches(), self.getNBases())
        self.Qemp = newQemp
        return newQemp

    def __str__(self):
        return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % (
            self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(),
            self.qEmpirical(), self.getNBases(), self.getNMismatches())

    def __repr__(self):
        return self.__str__()
def rawDataStream(file):
    """Yields successive RecalData records parsed from the CSV data file.

    Comment lines (containing '#') are skipped; header lines (containing
    'rg,') update the header used to parse the subsequent data lines.
    """
    header = None
    with open(file) as fh:
        for line in fh:
            if line.find("#") != -1:
                continue
            data = line.strip().split(',')
            if line.find("rg,") != -1:
                header = data
            else:
                yield RecalData().parse(header, data)
def rawDataByReadGroup(rawDataFile):
    """Yields (readGroup, [RecalData, ...]) pairs from rawDataFile.

    Relies on itertools.groupby, so the file must already be ordered by
    read group for the grouping to be complete.
    """
    for readGroup, generator in groupby(rawDataStream(rawDataFile), key=RecalData.readGroup):
        yield (readGroup, list(generator))
def combineRecalData(separateData):
    """Collapse a list of RecalData records into one combined RecalData."""
    return RecalData().combine(separateData)
def groupRecalData(allData, key=None):
    """Group allData by key, combining each group into a single RecalData.

    Returns a list of [key_value, combined_datum] pairs sorted by key value.
    """
    s = sorted(allData, key=key)
    # Inner variable renamed so it doesn't shadow the key parameter
    values = [[k, combineRecalData(vals)] for k, vals in groupby(s, key=key)]
    return sorted(values, key=lambda x: x[0])
#
# let's actually analyze the data!
#
def analyzeReadGroup ( readGroup , data , outputRoot ) :
print ' Read group => ' , readGroup
print ' Number of elements => ' , len ( data )
files = [ ]
if OPTIONS . toStdout :
2009-06-21 03:50:37 +08:00
basicQualScoreStats ( readGroup , data , sys . stdout )
2009-06-21 00:00:23 +08:00
qReportedVsqEmpirical ( readGroup , data , sys . stdout )
qDiffByCycle ( readGroup , data , sys . stdout )
qDiffByDinuc ( readGroup , data , sys . stdout )
else :
def outputFile ( tail ) :
file = outputRoot + tail
files . append ( file )
return file
2009-06-21 03:50:37 +08:00
with open ( outputFile ( " .basic_info.dat " ) , ' w ' ) as output :
basicQualScoreStats ( readGroup , data , output )
2009-06-21 00:00:23 +08:00
with open ( outputFile ( " .empirical_v_reported_quality.dat " ) , ' w ' ) as output :
qReportedVsqEmpirical ( readGroup , data , output )
with open ( outputFile ( " .quality_difference_v_cycle.dat " ) , ' w ' ) as output :
qDiffByCycle ( readGroup , data , output )
with open ( outputFile ( " .quality_difference_v_dinucleotide.dat " ) , ' w ' ) as output :
qDiffByDinuc ( readGroup , data , output )
print ' Files ' , files
return analyzeFiles ( files )
2009-06-21 03:50:37 +08:00
def countQsOfMinQuality(thres, data):
    """Returns combined RecalData for each of the following:
    all quality bins with qReported > thres, and all bins with both
    qReported > thres and qEmpirical > thres."""
    qDeclared = RecalData().combine([x for x in data if x.qReported() > thres])
    qDeclaredTrue = RecalData().combine(
        [x for x in data if x.qReported() > thres and x.qEmpirical() > thres])
    return qDeclared, qDeclaredTrue
def medianQreported(jaffe, allBases):
    """The qReported of the quality bin that contains the median base,
    weighting each bin by its base count."""
    i, ignore = medianByCounts(map(RecalData.getNBases, jaffe))
    return jaffe[i].qReported()
def medianByCounts(counts):
    """Treat counts as a histogram and return (index, count) of the bin
    containing the median observation.

    Returns None when counts is empty (or when no cumulative fraction ever
    exceeds 0.5, which can only happen with zero/negative entries).
    """
    counts = list(counts)  # allow any iterable; we traverse it twice
    nTotal = float(sum(counts))
    running = 0.0
    for i, count in enumerate(counts):
        running += count
        if running / nTotal > 0.5:
            # The current datum contains the median
            return i, count
    return None
def modeQreported(jaffe, allBases):
    """The qReported of the quality bin holding the most bases."""
    # max() with a key replaces the original sort-descending-take-first;
    # both pick the first maximal element on ties.
    return max(jaffe, key=RecalData.getNBases).qReported()
def averageQreported(jaffe, allBases):
    """Base-weighted average reported quality.

    allBases is the lane-wide combined RecalData, whose qReported was
    already computed as the expected-error-weighted average in combine().
    """
    return allBases.qReported()
def lsum(inlist):
    """Sum of a list of numbers; 0 for an empty list."""
    # builtin sum replaces the original reduce(operator.__add__, inlist, 0)
    return sum(inlist)
def lsamplestdev(inlist, counts, mean):
    """Weighted sample standard deviation of the values in inlist, where
    counts[i] is the number of observations of inlist[i] and mean is their
    weighted mean.

    Uses N-1 in the denominator (sample stdev), matching the unit test
    below; raises ZeroDivisionError when the total count is <= 1.
    """
    counts = list(counts)  # allow any iterable; we traverse it twice
    n = sum(counts)
    total = 0.0
    for item, count in zip(inlist, counts):
        diff = item - mean
        total += count * diff * diff
    return math.sqrt(total / float(n - 1))
def rmse(reportedList, empiricalList, counts):
    """Square root of the count-weighted sum of squared
    (reported - empirical) differences.

    NOTE(review): not normalized by the total count, so this is a
    root-sum-of-squares rather than a true RMSE; kept as-is since callers
    report it under this name.
    """
    total = 0.0
    for reported, empirical, count in zip(reportedList, empiricalList, counts):
        diff = reported - empirical
        total += count * diff * diff
    return math.sqrt(total)
def stdevQReported(jaffe, allBases):
    """Base-count-weighted standard deviation of the reported qualities."""
    mean = averageQreported(jaffe, allBases)
    return lsamplestdev(map(RecalData.qReported, jaffe), map(RecalData.getNBases, jaffe), mean)
def coeffOfVariationQreported(jaffe, allBases):
    """Coefficient of variation (stdev / mean) of the reported qualities."""
    mean = averageQreported(jaffe, allBases)
    stdev = stdevQReported(jaffe, allBases)
    return stdev / mean
def rmseJaffe(jaffe):
    """rmse() between reported and empirical qualities across the
    per-quality bins, weighted by each bin's base count."""
    return rmse(map(RecalData.qReported, jaffe),
                map(RecalData.qEmpirical, jaffe),
                map(RecalData.getNBases, jaffe))
def basicQualScoreStats(readGroup, data, output):
    """Write lane-wide summary statistics for one read group to output."""
    def o(s):
        print >> output, s
    # aggregate all the data into a single datum
    rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0]
    o("read_group %s" % rg)
    o("number_of_bases %d" % allBases.getNBases())
    o("number_of_mismatching_bases %d" % allBases.getNMismatches())
    o("lane_wide_Qreported %2.2f" % allBases.qReported())
    o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical())
    o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical() - allBases.qReported()))
    # one combined datum per reported-quality bin
    jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)]
    o("median_Qreported %2.2f" % medianQreported(jaffe, allBases))
    o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases))
    o("average_Qreported %2.2f" % averageQreported(jaffe, allBases))
    o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases))
    o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases))
    o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe))
    # how many bases are declared above each threshold, and how many of
    # those actually live up to their declared quality
    for thres in [20, 25, 30]:
        qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
        o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases()))
        o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases())))
        o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases()))
        o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases())))
def qDiffByCycle(readGroup, allData, output):
    """Table of empirical-minus-reported quality, binned by machine cycle."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
        datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
def qDiffByDinuc(readGroup, allData, output):
    """Table of empirical-minus-reported quality, binned by dinucleotide."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
        datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
def qReportedVsqEmpiricalStream(readGroup, data):
    """Yields (qReported, combined RecalData) pairs, one per reported-quality
    bin, with the identifying fields rewritten to wildcards."""
    for key, datum in groupRecalData(data, key=RecalData.qReported):
        datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*'])
        yield key, datum
def qReportedVsqEmpirical(readGroup, allData, output):
    """Table of empirical vs reported quality, one row per reported-quality
    bin, including each bin's share of the lane's bases."""
    print >> output, 'Qreported Qempirical nMismatches nBases PercentBases'
    rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0]
    for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
        print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0 * datum.getNBases() / float(allBases.getNBases()))
def analyzeRawData(rawDataFile):
    """Analyze each selected read group in rawDataFile, stopping after
    OPTIONS.maxReadGroups groups (-1 means no limit)."""
    nReadGroups = 0
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
            nReadGroups += 1
            if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups != -1:
                break
            else:
                root, sourceFilename = os.path.split(rawDataFile)
                if OPTIONS.outputDir:
                    root = OPTIONS.outputDir
                outputRoot = os.path.join(root, "%s.%s.%s" % (sourceFilename, readGroup, 'analysis'))
                analyzeReadGroup(readGroup, data, outputRoot)
# Maps filename regexes to either a callable analysis (run directly) or the
# name of an R plotting script option in the config's [R] section.
plottersByFile = {
    "raw_data.csv$": analyzeRawData,
    "recal_data.csv$": analyzeRawData,
    "empirical_v_reported_quality": 'PlotQEmpStated',
    "quality_difference_v_dinucleotide": 'PlotQDiffByDinuc',
    "quality_difference_v_cycle": 'PlotQDiffByCycle'}
def getPlotterForFile(file):
    """Return the R plotting script path registered for file, or run the
    registered analysis callable directly and return None."""
    for pat, analysis in plottersByFile.iteritems():
        if re.search(pat, file):
            if isinstance(analysis, str):
                return config.getOption('R', analysis, 'input_file')
            else:
                analysis(file)
    return None
def analyzeFiles ( files ) :
#print 'analyzeFiles', files
Rscript = config . getOption ( ' R ' , ' Rscript ' , ' input_file ' )
for file in files :
2009-06-24 09:12:35 +08:00
print ' Analyzing file ' , file
2009-06-21 00:00:23 +08:00
plotter = getPlotterForFile ( file )
2009-06-26 06:50:55 +08:00
if plotter < > None and not OPTIONS . noplots :
2009-06-21 00:00:23 +08:00
cmd = ' ' . join ( [ Rscript , plotter , file ] )
farm_commands . cmd ( cmd , None , None , just_print_commands = OPTIONS . dry )
2009-06-21 03:50:37 +08:00
def main ( ) :
global config , OPTIONS
2009-06-21 00:00:23 +08:00
usage = """ usage: % prog -c config.cfg files* """
parser = OptionParser ( usage = usage )
parser . add_option ( " -q " , " --farm " , dest = " farmQueue " ,
type = " string " , default = None ,
help = " Farm queue to send processing jobs to " )
parser . add_option ( " -d " , " --dir " , dest = " outputDir " ,
type = " string " , default = None ,
help = " If provided, analysis output files will be written to this directory " )
2009-07-07 22:15:39 +08:00
parser . add_option ( " -m " , " --maxReadGroups " , dest = " maxReadGroups " ,
type = " int " , default = - 1 ,
help = " Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed " )
2009-06-21 00:00:23 +08:00
parser . add_option ( " -c " , " --config " , dest = " configs " ,
action = " append " , type = " string " , default = [ ] ,
help = " Configuration file " )
parser . add_option ( " -s " , " --stdout " , dest = " toStdout " ,
action = ' store_true ' , default = False ,
help = " If provided, writes output to standard output, not to files " )
2009-06-26 06:50:55 +08:00
parser . add_option ( " " , " --no_plots " , dest = " noplots " ,
action = ' store_true ' , default = False ,
help = " If provided, no plots will be generated " )
2009-06-21 00:00:23 +08:00
parser . add_option ( " " , " --dry " , dest = " dry " ,
action = ' store_true ' , default = False ,
help = " If provided, nothing actually gets run, just a dry run " )
2009-06-24 09:12:35 +08:00
#parser.add_option("-r", "--raw", dest="raw",
# action='store_true', default=False,
# help="If provided, analyze data w.r.t. the raw empirical qulaity scores # mmismatches / # bases, as opposed to the Yates correction of +1 to each")
2009-06-21 00:00:23 +08:00
parser . add_option ( " -g " , " --readGroup " , dest = " selectedReadGroups " ,
action = " append " , type = " string " , default = [ ] ,
help = " If provided, only the provided read groups will be analyzed " )
( OPTIONS , args ) = parser . parse_args ( )
#if len(args) != 3:
# parser.error("incorrect number of arguments")
if len ( OPTIONS . configs ) == 0 :
parser . error ( " Requires at least one configuration file be provided " )
config = gatkConfigParser ( OPTIONS . configs )
2009-07-07 22:15:39 +08:00
if OPTIONS . selectedReadGroups < > [ ] : print ' Analyzing only the following read groups ' , OPTIONS . selectedReadGroups
2009-06-21 03:50:37 +08:00
analyzeFiles ( args )
import unittest

class TestanalzyeRecalQuals(unittest.TestCase):
    """Unit tests for the list-statistics helpers (lsum, lsamplestdev)."""
    def setUp(self):
        # 13 observations, expressed both flat and as a (value, count) histogram
        self.numbers = [0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6]
        self.numbersItems = [0, 1, 2, 3, 4, 5, 6]
        self.numbersCounts = [1, 1, 2, 1, 3, 3, 2]
        self.numbers_sum = 47
        self.numbers_mean = 3.615385
        self.numbers_mode = 4
        self.numbers_median = 4
        self.numbers_stdev = 1.894662
        self.numbers_var = 3.589744
        self.numbers_cov = self.numbers_stdev / self.numbers_mean

    def testSum(self):
        self.assertEqual(self.numbers_sum, lsum(self.numbers))
        self.assertEqual(0, lsum(self.numbers[0:0]))
        self.assertEqual(1, lsum(self.numbers[0:2]))
        self.assertEqual(3, lsum(self.numbers[0:3]))

    def teststdev(self):
        self.assertAlmostEqual(self.numbers_stdev, lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean), 4)
if __name__ == '__main__':
    main()
    #unittest.main()