Better support for comparison to truth. Now emits FP rates for each covariate if a truth file is provided. Also now writes out a detailed recal.log file that can be parsed directly into R
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2179 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
662bbbd53b
commit
2ea93385be
|
|
@ -231,12 +231,27 @@ def validateBins(bins):
|
||||||
elif contains2(left1) or contains2(right2):
|
elif contains2(left1) or contains2(right2):
|
||||||
raise Exception("Bad bins", left1, right1, left2, right2)
|
raise Exception("Bad bins", left1, right1, left2, right2)
|
||||||
|
|
||||||
def printFieldQualHeader(more = ""):
|
def printFieldQualHeader():
|
||||||
print ' field left right nVariants nNovels titv titvNovels dbSNP fprate q', more
|
more = ""
|
||||||
|
if TRUTH_CALLS <> None:
|
||||||
|
more = "TP FP FPRate FN FNRate"
|
||||||
|
def p(stream):
|
||||||
|
if stream <> None:
|
||||||
|
print >> stream, ' field left right nVariants nNovels titv titvNovels dbSNP fprate q', more
|
||||||
|
p(sys.stdout)
|
||||||
|
p(RECAL_LOG)
|
||||||
|
|
||||||
def printFieldQual( field, left, right, variants, FPRate, more = ""):
|
def printFieldQual( field, left, right, variants, FPRate ):
|
||||||
|
more = ""
|
||||||
|
if TRUTH_CALLS <> None:
|
||||||
|
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
|
||||||
|
more = str(callComparison)
|
||||||
novels = selectVariants(variants, VCFRecord.isNovel)
|
novels = selectVariants(variants, VCFRecord.isNovel)
|
||||||
print ' %s %s %8d %8d %.2f %.2f %.2f %.2e %d' % (field, binString(left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
|
def p(stream):
|
||||||
|
if stream <> None:
|
||||||
|
print >> stream, ' %s %s %8d %8d %.2f %.2f %.2f %.2e %d' % (field, binString(left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
|
||||||
|
p(sys.stdout)
|
||||||
|
p(RECAL_LOG)
|
||||||
|
|
||||||
def binString(left, right):
|
def binString(left, right):
|
||||||
leftStr = str(left)
|
leftStr = str(left)
|
||||||
|
|
@ -280,11 +295,9 @@ def optimizeCalls(variants, fields, titvTarget):
|
||||||
def printCallQuals(recalCalls, titvTarget, info = ""):
|
def printCallQuals(recalCalls, titvTarget, info = ""):
|
||||||
print '--------------------------------------------------------------------------------'
|
print '--------------------------------------------------------------------------------'
|
||||||
print info
|
print info
|
||||||
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = False, forcePrint = True )
|
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False )
|
||||||
print 'Cumulative'
|
print 'Cumulative'
|
||||||
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = True, forcePrint = True )
|
calibrateFeatures(recalCalls, ['QUAL'], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def all( p, l ):
|
def all( p, l ):
|
||||||
for elt in l:
|
for elt in l:
|
||||||
|
|
@ -310,10 +323,10 @@ def mapVariantBins(variants, field, cumulative = False):
|
||||||
|
|
||||||
return imap( variantsInBin, bins )
|
return imap( variantsInBin, bins )
|
||||||
|
|
||||||
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False ):
|
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True ):
|
||||||
covariates = []
|
covariates = []
|
||||||
|
|
||||||
printFieldQualHeader()
|
if printHeader: printFieldQualHeader()
|
||||||
for field in fields:
|
for field in fields:
|
||||||
if DEBUG: print 'Optimizing field', field
|
if DEBUG: print 'Optimizing field', field
|
||||||
|
|
||||||
|
|
@ -325,7 +338,7 @@ def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulativ
|
||||||
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
|
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
|
||||||
dbsnp = dbSNPRate(selectedVariants)
|
dbsnp = dbSNPRate(selectedVariants)
|
||||||
covariates.append(CallCovariate(field, left, right, FPRate))
|
covariates.append(CallCovariate(field, left, right, FPRate))
|
||||||
printFieldQual(field, left, right, selectedVariants, FPRate )
|
printFieldQual( prefix + field, left, right, selectedVariants, FPRate )
|
||||||
else:
|
else:
|
||||||
print 'Not calibrating bin', left, right, 'because it contains too few variants:', len(selectedVariants)
|
print 'Not calibrating bin', left, right, 'because it contains too few variants:', len(selectedVariants)
|
||||||
|
|
||||||
|
|
@ -369,22 +382,20 @@ def sensitivitySpecificity(variants, truth):
|
||||||
nFN = len(truth) - nTP
|
nFN = len(truth) - nTP
|
||||||
return CallCmp(nTP, nFP, nFN), FPs
|
return CallCmp(nTP, nFP, nFN), FPs
|
||||||
|
|
||||||
|
|
||||||
def compareCalls(calls, truthCalls):
|
def compareCalls(calls, truthCalls):
|
||||||
for variant in calls: variant.setField("TP", 0) # set the TP field to 0
|
for variant in calls: variant.setField("TP", 0) # set the TP field to 0
|
||||||
|
|
||||||
def compare1(name, cumulative):
|
def compare1(name, cumulative):
|
||||||
for left, right, selectedVariants in mapVariantBins(calls, 'QUAL', cumulative = cumulative):
|
for left, right, selectedVariants in mapVariantBins(calls, 'QUAL', cumulative = cumulative):
|
||||||
callComparison, theseFPs = sensitivitySpecificity(selectedVariants, truthCalls)
|
|
||||||
#print selectedVariants[0]
|
#print selectedVariants[0]
|
||||||
printFieldQual(name, left, right, selectedVariants, dephredScale(left), str(callComparison))
|
printFieldQual(name, left, right, selectedVariants, dephredScale(left))
|
||||||
|
|
||||||
print 'PER BIN nCalls=', len(calls)
|
print 'PER BIN nCalls=', len(calls)
|
||||||
printFieldQualHeader("TP FP FPRate FN FNRate")
|
# printFieldQualHeader()
|
||||||
compare1('TRUTH-PER-BIN', False)
|
compare1('RECAL-QUAL-TRUTH-PER-BIN', False)
|
||||||
|
|
||||||
print 'CUMULATIVE nCalls=', len(calls)
|
print 'CUMULATIVE nCalls=', len(calls)
|
||||||
compare1('TRUTH-CUMULATIVE', True)
|
compare1('RECAL-QUAL-TRUTH-CUMULATIVE', True)
|
||||||
|
|
||||||
def randomSplit(l, pLeft):
|
def randomSplit(l, pLeft):
|
||||||
def keep(elt, p):
|
def keep(elt, p):
|
||||||
|
|
@ -406,6 +417,9 @@ def setup():
|
||||||
parser.add_option("-t", "--truth", dest="truth",
|
parser.add_option("-t", "--truth", dest="truth",
|
||||||
type='string', default=None,
|
type='string', default=None,
|
||||||
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
|
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
|
||||||
|
parser.add_option("-l", "--recalLog", dest="recalLog",
|
||||||
|
type='string', default="recal.log",
|
||||||
|
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
|
||||||
parser.add_option("", "--unFilteredTruth", dest="unFilteredTruth",
|
parser.add_option("", "--unFilteredTruth", dest="unFilteredTruth",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, the unfiltered truth calls will be used in comparisons [default: %default]")
|
help="If provided, the unfiltered truth calls will be used in comparisons [default: %default]")
|
||||||
|
|
@ -425,7 +439,7 @@ def setup():
|
||||||
type='int', default=60,
|
type='int', default=60,
|
||||||
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
||||||
parser.add_option("-o", "--outputVCF", dest="outputVCF",
|
parser.add_option("-o", "--outputVCF", dest="outputVCF",
|
||||||
type='string', default=None,
|
type='string', default="recal.vcf",
|
||||||
help="If provided, a VCF file will be written out to this file [default: %default]")
|
help="If provided, a VCF file will be written out to this file [default: %default]")
|
||||||
parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
|
parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
|
||||||
type='string', default=None,
|
type='string', default=None,
|
||||||
|
|
@ -491,7 +505,7 @@ def writeRecalibratedCalls(file, header, calls):
|
||||||
print >> f, line
|
print >> f, line
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
def evaluateTruth(header, callVCF, truthVCF):
|
def readTruth(truthVCF):
|
||||||
print 'Reading truth file', truthVCF
|
print 'Reading truth file', truthVCF
|
||||||
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = False)[1])
|
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = False)[1])
|
||||||
def keepVariant(t):
|
def keepVariant(t):
|
||||||
|
|
@ -499,7 +513,9 @@ def evaluateTruth(header, callVCF, truthVCF):
|
||||||
return OPTIONS.unFilteredTruth or t.passesFilters()
|
return OPTIONS.unFilteredTruth or t.passesFilters()
|
||||||
truth = dict( [[v.getLoc(), v] for v in filter(keepVariant, rawTruth)])
|
truth = dict( [[v.getLoc(), v] for v in filter(keepVariant, rawTruth)])
|
||||||
print len(rawTruth), len(truth)
|
print len(rawTruth), len(truth)
|
||||||
|
return truth
|
||||||
|
|
||||||
|
def evaluateTruth(header, callVCF, truth, truthVCF):
|
||||||
print 'Reading variants back in from', callVCF
|
print 'Reading variants back in from', callVCF
|
||||||
header, calls = readVariants(callVCF)
|
header, calls = readVariants(callVCF)
|
||||||
calls = list(calls)
|
calls = list(calls)
|
||||||
|
|
@ -519,12 +535,28 @@ def evaluateTruth(header, callVCF, truthVCF):
|
||||||
print >> f, line
|
print >> f, line
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
TRUTH_CALLS = None
|
||||||
|
RECAL_LOG = None
|
||||||
def main():
|
def main():
|
||||||
|
global TRUTH_CALLS, RECAL_LOG
|
||||||
|
|
||||||
args = setup()
|
args = setup()
|
||||||
|
fields = OPTIONS.fields.split(',')
|
||||||
|
|
||||||
|
truthVCF = None
|
||||||
|
if len(args) > 1:
|
||||||
|
truthVCF = args[1]
|
||||||
|
TRUTH_CALLS = readTruth(truthVCF)
|
||||||
|
|
||||||
|
if OPTIONS.recalLog <> None:
|
||||||
|
RECAL_LOG = open(OPTIONS.recalLog, "w")
|
||||||
|
print >> RECAL_LOG, "# optimized vcf", args[0]
|
||||||
|
print >> RECAL_LOG, "# truth vcf", truthVCF
|
||||||
|
for key, value in OPTIONS.__dict__.iteritems():
|
||||||
|
print >> RECAL_LOG, '#', key, value
|
||||||
|
|
||||||
header, allCalls, titvTarget = assessCalls(args[0])
|
header, allCalls, titvTarget = assessCalls(args[0])
|
||||||
if not OPTIONS.dontRecalibrate:
|
if not OPTIONS.dontRecalibrate:
|
||||||
fields = OPTIONS.fields.split(',')
|
|
||||||
covariates = determineCovariates(allCalls, titvTarget, fields)
|
covariates = determineCovariates(allCalls, titvTarget, fields)
|
||||||
header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords)
|
header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords)
|
||||||
RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
|
RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
|
||||||
|
|
@ -534,7 +566,7 @@ def main():
|
||||||
OPTIONS.outputVCF = args[0]
|
OPTIONS.outputVCF = args[0]
|
||||||
|
|
||||||
if len(args) > 1:
|
if len(args) > 1:
|
||||||
evaluateTruth(header, OPTIONS.outputVCF, args[1])
|
evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
|
||||||
|
|
||||||
|
|
||||||
PROFILE = False
|
PROFILE = False
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue