Better support for comparison to truth. Now emits FP rates for each covariate if a truth file is provided. Also now writes out a detailed recal.log file that can be parsed directly into R

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2179 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-11-29 22:20:40 +00:00
parent 662bbbd53b
commit 2ea93385be
1 changed files with 53 additions and 21 deletions

View File

@ -231,12 +231,27 @@ def validateBins(bins):
elif contains2(left1) or contains2(right2):
raise Exception("Bad bins", left1, right1, left2, right2)
def printFieldQualHeader(more = ""):
print ' field left right nVariants nNovels titv titvNovels dbSNP fprate q', more
def printFieldQualHeader():
more = ""
if TRUTH_CALLS <> None:
more = "TP FP FPRate FN FNRate"
def p(stream):
if stream <> None:
print >> stream, ' field left right nVariants nNovels titv titvNovels dbSNP fprate q', more
p(sys.stdout)
p(RECAL_LOG)
def printFieldQual( field, left, right, variants, FPRate, more = ""):
def printFieldQual( field, left, right, variants, FPRate ):
more = ""
if TRUTH_CALLS <> None:
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
more = str(callComparison)
novels = selectVariants(variants, VCFRecord.isNovel)
print ' %s %s %8d %8d %.2f %.2f %.2f %.2e %d' % (field, binString(left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
def p(stream):
if stream <> None:
print >> stream, ' %s %s %8d %8d %.2f %.2f %.2f %.2e %d' % (field, binString(left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
p(sys.stdout)
p(RECAL_LOG)
def binString(left, right):
leftStr = str(left)
@ -280,11 +295,9 @@ def optimizeCalls(variants, fields, titvTarget):
def printCallQuals(recalCalls, titvTarget, info = ""):
print '--------------------------------------------------------------------------------'
print info
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = False, forcePrint = True )
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False )
print 'Cumulative'
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = True, forcePrint = True )
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False )
def all( p, l ):
for elt in l:
@ -310,10 +323,10 @@ def mapVariantBins(variants, field, cumulative = False):
return imap( variantsInBin, bins )
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False ):
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True ):
covariates = []
printFieldQualHeader()
if printHeader: printFieldQualHeader()
for field in fields:
if DEBUG: print 'Optimizing field', field
@ -325,7 +338,7 @@ def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulativ
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
dbsnp = dbSNPRate(selectedVariants)
covariates.append(CallCovariate(field, left, right, FPRate))
printFieldQual(field, left, right, selectedVariants, FPRate )
printFieldQual( prefix + field, left, right, selectedVariants, FPRate )
else:
print 'Not calibrating bin', left, right, 'because it contains too few variants:', len(selectedVariants)
@ -369,22 +382,20 @@ def sensitivitySpecificity(variants, truth):
nFN = len(truth) - nTP
return CallCmp(nTP, nFP, nFN), FPs
def compareCalls(calls, truthCalls):
for variant in calls: variant.setField("TP", 0) # set the TP field to 0
def compare1(name, cumulative):
for left, right, selectedVariants in mapVariantBins(calls, 'QUAL', cumulative = cumulative):
callComparison, theseFPs = sensitivitySpecificity(selectedVariants, truthCalls)
#print selectedVariants[0]
printFieldQual(name, left, right, selectedVariants, dephredScale(left), str(callComparison))
printFieldQual(name, left, right, selectedVariants, dephredScale(left))
print 'PER BIN nCalls=', len(calls)
printFieldQualHeader("TP FP FPRate FN FNRate")
compare1('TRUTH-PER-BIN', False)
# printFieldQualHeader()
compare1('RECAL-QUAL-TRUTH-PER-BIN', False)
print 'CUMULATIVE nCalls=', len(calls)
compare1('TRUTH-CUMULATIVE', True)
compare1('RECAL-QUAL-TRUTH-CUMULATIVE', True)
def randomSplit(l, pLeft):
def keep(elt, p):
@ -406,6 +417,9 @@ def setup():
parser.add_option("-t", "--truth", dest="truth",
type='string', default=None,
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
parser.add_option("-l", "--recalLog", dest="recalLog",
type='string', default="recal.log",
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
parser.add_option("", "--unFilteredTruth", dest="unFilteredTruth",
action='store_true', default=False,
help="If provided, the unfiltered truth calls will be used in comparisons [default: %default]")
@ -425,7 +439,7 @@ def setup():
type='int', default=60,
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
parser.add_option("-o", "--outputVCF", dest="outputVCF",
type='string', default=None,
type='string', default="recal.vcf",
help="If provided, a VCF file will be written out to this file [default: %default]")
parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
type='string', default=None,
@ -491,7 +505,7 @@ def writeRecalibratedCalls(file, header, calls):
print >> f, line
f.close()
def evaluateTruth(header, callVCF, truthVCF):
def readTruth(truthVCF):
print 'Reading truth file', truthVCF
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = False)[1])
def keepVariant(t):
@ -499,7 +513,9 @@ def evaluateTruth(header, callVCF, truthVCF):
return OPTIONS.unFilteredTruth or t.passesFilters()
truth = dict( [[v.getLoc(), v] for v in filter(keepVariant, rawTruth)])
print len(rawTruth), len(truth)
return truth
def evaluateTruth(header, callVCF, truth, truthVCF):
print 'Reading variants back in from', callVCF
header, calls = readVariants(callVCF)
calls = list(calls)
@ -519,12 +535,28 @@ def evaluateTruth(header, callVCF, truthVCF):
print >> f, line
f.close()
TRUTH_CALLS = None
RECAL_LOG = None
def main():
global TRUTH_CALLS, RECAL_LOG
args = setup()
fields = OPTIONS.fields.split(',')
truthVCF = None
if len(args) > 1:
truthVCF = args[1]
TRUTH_CALLS = readTruth(truthVCF)
if OPTIONS.recalLog <> None:
RECAL_LOG = open(OPTIONS.recalLog, "w")
print >> RECAL_LOG, "# optimized vcf", args[0]
print >> RECAL_LOG, "# truth vcf", truthVCF
for key, value in OPTIONS.__dict__.iteritems():
print >> RECAL_LOG, '#', key, value
header, allCalls, titvTarget = assessCalls(args[0])
if not OPTIONS.dontRecalibrate:
fields = OPTIONS.fields.split(',')
covariates = determineCovariates(allCalls, titvTarget, fields)
header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords)
RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
@ -534,7 +566,7 @@ def main():
OPTIONS.outputVCF = args[0]
if len(args) > 1:
evaluateTruth(header, OPTIONS.outputVCF, args[1])
evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
PROFILE = False