# 2009-06-21 00:00:23 +08:00  (VCS timestamp artifact; not code)
from __future__ import with_statement
import farm_commands
import os . path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import re
from itertools import *
import math
# 2009-06-21 03:50:37 +08:00  (VCS timestamp artifact; not code)
import operator
def phredQScore(nMismatches, nBases):
    """Phred-scale quality implied by an observed mismatch count.

    Capped at Q40 when no mismatches were observed; Q0 when there is no
    data to judge from.
    """
    if nMismatches == 0:
        return 40
    if nBases == 0:
        return 0
    errorRate = float(nMismatches) / nBases
    return -10 * math.log10(errorRate)
def phredScore2ErrorProp(qual):
    """Convert a Phred quality score back into an error probability."""
    return 10.0 ** (-float(qual) / 10.0)
def tryByInt(s):
    """Return int(s) when s parses as an integer, otherwise s unchanged."""
    try:
        return int(s)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` -- only conversion failures should
        # fall through to "leave the value alone".
        return s
# Column names of the recalibration CSV and the default (empty) record.
expectedHeader = ['rg', 'dn', 'Qrep', 'pos', 'NBases', 'MMismatches', 'Qemp']
defaultValues = ['0', '**', '0', '0', '0', '0', '0']
class RecalData(dict):
    """One bin of recalibration counts, keyed by the raw CSV column names.

    Behaves as a dict of the CSV columns (rg, dn, Qrep, pos, NBases,
    MMismatches, Qemp) with attribute-style read access via __getattr__.

    NOTE(review): combine()/updateQemp() assign instance attributes
    (e.g. self.NBases = ...) which thereafter shadow the dict entries;
    reads still work because __getattr__ is only consulted when normal
    attribute lookup fails, but the dict keys become stale after combine().
    """

    def __init__(self):
        # Start from the default record so every expected field exists.
        self.parse(expectedHeader, defaultValues)

    def parse(self, header, data):
        """Bind one CSV record (header name -> converted value) into self.

        Raises ValueError when the header does not match expectedHeader.
        Returns self so calls can be chained.
        """
        # Per-column converters: rg, dn, Qrep, pos, NBases, MMismatches, Qemp
        converters = [str, str, int, tryByInt, int, int, int]
        for head, expected, datum, convert in zip(header, expectedHeader, data, converters):
            if head != expected:
                # The original raised a bare string, which is a TypeError on
                # Python >= 2.6; raise a real exception instead.
                raise ValueError("Unexpected header in rawData %s %s %s" % (head, expected, datum))
            self[head] = convert(datum)
        return self

    def set(self, header, values):
        """Assign values to the named fields (no type conversion)."""
        for head, val in zip(header, values):
            self[head] = val

    def __getattr__(self, name):
        # Attribute reads fall back to the dict entries.
        return self[name]

    # Named accessors for the raw CSV columns:
    # rg,dn,Qrep,pos,NBases,MMismatches,Qemp
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def qEmpirical(self): return self.Qemp
    def cycle(self): return self.pos
    def nBases(self): return self.NBases
    def nMismatches(self): return self.MMismatches

    def nExpectedMismatches(self):
        """Mismatch count this bin's reported quality predicts."""
        return self.nBases() * phredScore2ErrorProp(self.qReported())

    def combine(self, moreData):
        """Fold the counts of each RecalData in moreData into self.

        Qemp is recomputed from the pooled counts; Qrep becomes the quality
        implied by the pooled expected-error count (i.e. the base-weighted
        average reported quality).  Returns self.
        """
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
            self.NBases += datum.nBases()
            self.MMismatches += datum.nMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
        self.Qrep = phredQScore(sumErrors, self.nBases())
        return self

    def updateQemp(self):
        """Recompute, store, and return the empirical quality."""
        newQemp = phredQScore(self.nMismatches(), self.nBases())
        self.Qemp = newQemp
        return newQemp

    def __str__(self):
        return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % (
            self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(),
            self.qEmpirical(), self.nBases(), self.nMismatches())

    def __repr__(self):
        return self.__str__()
# def __init__(dinuc, Qrep, pos, nbases, nmismatches, qemp ):
# self.dinuc = dinuc
# self.Qrep = Qrep
def rawDataStream(file):
    """Yield RecalData records parsed from a recalibration CSV.

    Comment lines (containing '#') are skipped; the most recent header row
    (one containing 'rg,') is used to parse every subsequent data row.
    """
    header = None
    # `with` closes the handle even if a consumer abandons the generator;
    # the original left the file open.
    with open(file) as stream:
        for line in stream:
            if line.find("#") != -1:
                continue
            data = line.strip().split(',')
            if line.find("rg,") != -1:
                header = data
            else:
                yield RecalData().parse(header, data)
def rawDataByReadGroup(rawDataFile):
    """Yield (readGroup, [RecalData, ...]) pairs from rawDataFile.

    NOTE(review): itertools.groupby only merges *adjacent* records, so this
    assumes the file is already ordered by read group -- confirm upstream.
    """
    stream = rawDataStream(rawDataFile)
    for readGroup, records in groupby(stream, key=RecalData.readGroup):
        yield readGroup, list(records)
def combineRecalData(separateData):
    """Collapse a sequence of RecalData bins into one aggregated RecalData."""
    combined = RecalData()
    return combined.combine(separateData)
def groupRecalData(allData, key=None):
    """Group allData by key and aggregate each group into one RecalData.

    Returns [[keyValue, combinedRecalData], ...] ordered by keyValue.
    """
    ordered = sorted(allData, key=key)
    # Renamed the loop variable: the original rebound `key`, shadowing
    # the parameter inside the comprehension.
    grouped = []
    for groupKey, groupVals in groupby(ordered, key=key):
        grouped.append([groupKey, combineRecalData(groupVals)])
    grouped.sort(key=lambda pair: pair[0])
    return grouped
#
# let's actually analyze the data!
#
def analyzeReadGroup(readGroup, data, outputRoot):
    """Run every per-read-group report, either to stdout or to .dat files.

    Returns the result of analyzeFiles() over whatever files were written
    (an empty file list when reporting to stdout).
    """
    sys.stdout.write('Read group => %s\n' % readGroup)
    sys.stdout.write('Number of elements => %s\n' % len(data))
    files = []
    if OPTIONS.toStdout:
        basicQualScoreStats(readGroup, data, sys.stdout)
        qReportedVsqEmpirical(readGroup, data, sys.stdout)
        qDiffByCycle(readGroup, data, sys.stdout)
        qDiffByDinuc(readGroup, data, sys.stdout)
    else:
        def outputFile(tail):
            # Record each output path so the caller can plot it afterwards.
            file = outputRoot + tail
            files.append(file)
            return file
        reports = [
            (".basic_info.dat", basicQualScoreStats),
            (".empirical_v_reported_quality.dat", qReportedVsqEmpirical),
            (".quality_difference_v_cycle.dat", qDiffByCycle),
            (".quality_difference_v_dinucleotide.dat", qDiffByDinuc),
        ]
        for tail, report in reports:
            with open(outputFile(tail), 'w') as output:
                report(readGroup, data, output)
    sys.stdout.write('Files %s\n' % files)
    return analyzeFiles(files)
def countQsOfMinQuality(thres, data):
    """Aggregate quality bins above a threshold.

    Returns (qDeclared, qDeclaredTrue): the combined RecalData of all bins
    whose reported quality exceeds thres, and of the subset whose empirical
    quality also exceeds thres.
    """
    declared = [x for x in data if x.qReported() > thres]
    declaredTrue = [x for x in declared if x.qEmpirical() > thres]
    qDeclared = RecalData().combine(declared)
    qDeclaredTrue = RecalData().combine(declaredTrue)
    return qDeclared, qDeclaredTrue
def medianQreported(jaffe, allBases):
    """Reported quality of the bin holding the base-weighted median.

    allBases is unused; the parameter is kept for signature parity with
    the other *Qreported helpers.
    """
    counts = [datum.nBases() for datum in jaffe]
    medianIndex, _count = medianByCounts(counts)
    return jaffe[medianIndex].qReported()
def medianByCounts(counts):
    """Locate the bin containing the weighted median.

    counts is a list of bin sizes.  Returns (index, count) of the first bin
    at which the cumulative fraction of the total exceeds one half.

    Raises ValueError when counts is empty or sums to <= 0 (the original
    silently returned None or raised ZeroDivisionError in those cases).
    """
    nTotal = float(sum(counts))
    if nTotal <= 0:
        raise ValueError("medianByCounts requires a positive total count")
    running = 0.0
    for i, count in enumerate(counts):
        running += count
        if running / nTotal > 0.5:
            # This bin pushes the cumulative mass past 50%: it holds the median.
            return i, count
def modeQreported(jaffe, allBases):
    """Reported quality of the single most-populated bin.

    allBases is unused; kept for signature parity with the other helpers.
    On ties, the earliest bin wins (same as the original stable sort).
    """
    biggest = max(jaffe, key=RecalData.nBases)
    return biggest.qReported()
def averageQreported(jaffe, allBases):
    """Base-weighted mean reported quality.

    allBases.qReported() already holds this value (set by combine()), so
    jaffe is unused here; the parameter is kept for signature parity.
    """
    return allBases.qReported()
def lsum(inlist):
    """Sum of a numeric list (0 for an empty list)."""
    # Builtin sum replaces reduce(operator.__add__, inlist, 0): identical
    # for the numeric lists this script feeds it, and Python-3 safe.
    return sum(inlist, 0)
def lsamplestdev(inlist, counts, mean):
    """Weighted *sample* standard deviation of binned data.

    inlist holds the bin values, counts the bin sizes, and mean the
    precomputed weighted mean.  Uses the N-1 (sample) denominator.
    (The original docstring claimed it returned the variance with an N
    denominator; the code has always returned the N-1 stdev.)

    Raises ZeroDivisionError when the total count is <= 1.
    """
    n = sum(counts)
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    for item, count in zip(inlist, counts):
        diff = item - mean
        total += count * diff * diff
    return math.sqrt(total / float(n - 1))
def rmse(reportedList, empiricalList, counts):
    """Root of the count-weighted sum of squared reported-empirical gaps.

    NOTE(review): this is sqrt(sum(count * diff^2)) with no division by the
    total count, so it is not a normalized RMSE -- confirm intent before
    renaming or "fixing".
    """
    total = 0.0
    for reported, empirical, count in zip(reportedList, empiricalList, counts):
        gap = reported - empirical
        total += count * gap * gap
    return math.sqrt(total)
def stdevQReported(jaffe, allBases):
    """Base-weighted sample stdev of the reported qualities across bins."""
    mean = averageQreported(jaffe, allBases)
    qualities = [datum.qReported() for datum in jaffe]
    weights = [datum.nBases() for datum in jaffe]
    return lsamplestdev(qualities, weights, mean)
def coeffOfVariationQreported(jaffe, allBases):
    """Coefficient of variation (stdev / mean) of the reported quality.

    NOTE(review): divides by the mean -- a zero mean reported quality
    would raise ZeroDivisionError, same as the original.
    """
    mean = averageQreported(jaffe, allBases)
    return stdevQReported(jaffe, allBases) / mean
# o("variance_Qreported %2.2f" % varianceQreported(jaffe))
def rmseJaffe(jaffe):
    """rmse() over the per-bin reported/empirical qualities, weighted by size."""
    reported = [datum.qReported() for datum in jaffe]
    empirical = [datum.qEmpirical() for datum in jaffe]
    counts = [datum.nBases() for datum in jaffe]
    return rmse(reported, empirical, counts)
def basicQualScoreStats(readGroup, data, output):
    """Write the lane-wide summary statistics for one read group to output."""
    def o(s):
        # One stat per line, matching the original print >> formatting.
        output.write('%s\n' % s)

    # Aggregate everything into a single lane-wide datum.
    rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0]
    o("read_group %s" % rg)
    o("number_of_bases %d" % allBases.nBases())
    o("number_of_mismatching_bases %d" % allBases.nMismatches())
    o("lane_wide_Qreported %2.2f" % allBases.qReported())
    o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical())
    o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical() - allBases.qReported()))

    # Per-reported-quality bins drive the distribution statistics.
    jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)]
    o("median_Qreported %2.2f" % medianQreported(jaffe, allBases))
    o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases))
    o("average_Qreported %2.2f" % averageQreported(jaffe, allBases))
    o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases))
    o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases))
    o("RMSE(qReported,qEmpirical) %2.2f" % rmseJaffe(jaffe))

    # How much of the lane claims -- and earns -- each quality threshold.
    for thres in [20, 25, 30]:
        qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
        pctDeclared = 100 * qDeclared.nBases() / float(allBases.nBases())
        pctTrue = 100 * qDeclaredTrue.nBases() / float(allBases.nBases())
        o("number_of_q%d_bases %d" % (thres, qDeclared.nBases()))
        o("percent_of_q%d_bases %2.2f" % (thres, pctDeclared))
        o("number_of_q%d_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.nBases()))
        o("percent_of_q%d_bases_with_qemp_above_q%d %2.2f" % (thres, thres, pctTrue))
def qDiffByCycle(readGroup, allData, output):
    """Tabulate empirical-vs-reported quality per machine cycle."""
    output.write('# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases\n')
    output.write('Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases\n')
    for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
        # Re-label the aggregated bin: known cycle, wildcard dinucleotide.
        datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        diff = datum.qEmpirical() - datum.qReported()
        row = "%s %2.2f %2.2f %2.2f %12d %12d" % (
            datum.cycle(), datum.qReported(), datum.qEmpirical(), diff,
            datum.nMismatches(), datum.nBases())
        output.write(row + '\n')
def qDiffByDinuc(readGroup, allData, output):
    """Tabulate empirical-vs-reported quality per dinucleotide context."""
    output.write('# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases\n')
    output.write('Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases\n')
    for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
        # Re-label the aggregated bin: known dinucleotide, wildcard cycle.
        datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        diff = datum.qEmpirical() - datum.qReported()
        row = "%s %2.2f %2.2f %2.2f %12d %12d" % (
            datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff,
            datum.nMismatches(), datum.nBases())
        output.write(row + '\n')
def qReportedVsqEmpiricalStream(readGroup, data):
    """Yield (qReported, combined RecalData) pairs, one per reported-Q bin."""
    for qrep, datum in groupRecalData(data, key=RecalData.qReported):
        # Re-label the aggregated bin: known reported Q, wildcards elsewhere.
        datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', qrep, '*'])
        yield qrep, datum
def qReportedVsqEmpirical(readGroup, allData, output):
    """Tabulate empirical vs reported quality for each reported-Q bin."""
    output.write('Qreported Qempirical nMismatches nBases\n')
    for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
        row = "%2.2f %2.2f %12d %12d" % (
            datum.qReported(), datum.qEmpirical(),
            datum.nMismatches(), datum.nBases())
        output.write(row + '\n')
def analyzeRawData(rawDataFile):
    """Analyze every (selected) read group found in rawDataFile.

    Output files land next to the input, or in OPTIONS.outputDir when set.
    """
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        # An empty selection list means "analyze every read group".
        if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
            root, sourceFilename = os.path.split(rawDataFile)
            if OPTIONS.outputDir:
                root = OPTIONS.outputDir
            outputRoot = os.path.join(root, "%s.%s.%s" % (sourceFilename, readGroup, 'analysis'))
            analyzeReadGroup(readGroup, data, outputRoot)
# Map of filename regex patterns to their analysis: either a callable run
# directly on the file, or the name of an R plotting script to look up in
# the config (see getPlotterForFile).
plottersByFile = {
    "raw_data.csv$": analyzeRawData,
    "recal_data.csv$": analyzeRawData,
    "empirical_v_reported_quality": 'PlotQEmpStated',
    "quality_difference_v_dinucleotide": 'PlotQDiffByDinuc',
    "quality_difference_v_cycle": 'PlotQDiffByCycle',
}
def getPlotterForFile(file):
    """Resolve or run the analysis registered for a file.

    For pattern entries naming an R script, returns the configured script
    path.  For callable entries, runs the analysis on the file in place.
    Returns None when nothing matched or the analysis was run directly.
    """
    # items() instead of iteritems(): works on Python 2 and 3.
    for pat, analysis in plottersByFile.items():
        if re.search(pat, file):
            if isinstance(analysis, str):  # isinstance, not type() == str
                return config.getOption('R', analysis, 'input_file')
            else:
                # Note: deliberately no break -- the original kept scanning
                # the remaining patterns after running a callable analysis.
                analysis(file)
    return None
def analyzeFiles(files):
    """Run the configured R plotter over each analysis file.

    Files with no registered plotter (or whose analysis ran directly) are
    skipped.  Commands are submitted through farm_commands, honoring the
    --dry option.
    """
    Rscript = config.getOption('R', 'Rscript', 'input_file')
    for file in files:
        plotter = getPlotterForFile(file)
        # `is not None` replaces the removed `<>` operator.
        if plotter is not None:
            cmd = ' '.join([Rscript, plotter, file])
            farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.dry)
def main():
    """Parse command-line options, load the config, and analyze each input file."""
    global config, OPTIONS

    usage = """usage: %prog -c config.cfg files*"""
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="outputDir",
                      type="string", default=None,
                      help="If provided, analysis output files will be written to this directory")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-s", "--stdout", dest="toStdout",
                      action='store_true', default=False,
                      help="If provided, writes output to standard output, not to files")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
                      action="append", type="string", default=[],
                      help="If provided, only the provided read groups will be analyzed")
    OPTIONS, args = parser.parse_args()

    # At least one -c config.cfg is mandatory: the R plotters come from it.
    if not OPTIONS.configs:
        parser.error("Requires at least one configuration file be provided")
    config = gatkConfigParser(OPTIONS.configs)

    analyzeFiles(args)
import unittest
class TestanalzyeRecalQuals(unittest.TestCase):
    """Unit tests for the list-statistics helpers (lsum, lsamplestdev)."""

    def setUp(self):
        # A raw sample plus its binned (item, count) representation; the
        # expected statistics were computed by hand for this fixture.
        self.numbers = [0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6]
        self.numbersItems = [0, 1, 2, 3, 4, 5, 6]
        self.numbersCounts = [1, 1, 2, 1, 3, 3, 2]
        self.numbers_sum = 47
        self.numbers_mean = 3.615385
        self.numbers_mode = 4
        self.numbers_median = 4
        self.numbers_stdev = 1.894662
        self.numbers_var = 3.589744
        self.numbers_cov = self.numbers_stdev / self.numbers_mean

    def testSum(self):
        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(self.numbers_sum, lsum(self.numbers))
        self.assertEqual(0, lsum(self.numbers[0:0]))
        self.assertEqual(1, lsum(self.numbers[0:2]))
        self.assertEqual(3, lsum(self.numbers[0:3]))

    def teststdev(self):
        self.assertAlmostEqual(
            self.numbers_stdev,
            lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean),
            4)
if __name__ == '__main__':
    # Run the analysis driver; swap in unittest.main() to run the tests.
    main()
    #unittest.main()