minor improvements to snpSelector to work with hapmap chip VCF files
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2343 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
45199136f0
commit
56467df49a
|
|
@ -67,24 +67,32 @@ class RecalibratedCall:
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))
|
return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))
|
||||||
|
|
||||||
def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None ):
|
def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, mustBeVariant = False ):
|
||||||
if filter == None:
|
if filter == None:
|
||||||
filter = not OPTIONS.unfiltered
|
filter = not OPTIONS.unfiltered
|
||||||
|
|
||||||
f = open(file)
|
f = open(file)
|
||||||
header, columnNames, lines = readVCFHeader(f)
|
header, columnNames, lines = readVCFHeader(f)
|
||||||
|
|
||||||
|
nLowQual = 0
|
||||||
def parseVariant(args):
|
def parseVariant(args):
|
||||||
|
global nLowQual
|
||||||
header1, VCF, counter = args
|
header1, VCF, counter = args
|
||||||
if filter and not VCF.passesFilters():
|
if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
|
||||||
#print 'filtering', VCF
|
#print 'filtering', VCF
|
||||||
return None
|
return None
|
||||||
|
elif VCF.getQual() <= minQScore:
|
||||||
|
#print 'filtering', VCF
|
||||||
|
#nLowQual += 1
|
||||||
|
return None
|
||||||
elif random.random() <= downsampleFraction:
|
elif random.random() <= downsampleFraction:
|
||||||
return VCF
|
return VCF
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
|
variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
|
||||||
|
if nLowQual > 0:
|
||||||
|
print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore)
|
||||||
return header, variants
|
return header, variants
|
||||||
|
|
||||||
def selectVariants( variants, selector = None ):
|
def selectVariants( variants, selector = None ):
|
||||||
|
|
@ -286,15 +294,15 @@ def binString(field, left, right):
|
||||||
left = right = FIELD_RANGES[field] - epsilon
|
left = right = FIELD_RANGES[field] - epsilon
|
||||||
if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
|
if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
|
||||||
left = right - epsilon
|
left = right - epsilon
|
||||||
if isNumber(left): leftStr = "%.2f" % left
|
if isNumber(left): leftStr = "%.4f" % left
|
||||||
if isNumber(right): rightStr = "%.2f" % right
|
if isNumber(right): rightStr = "%.4f" % right
|
||||||
return '%8s %8s' % (leftStr, rightStr)
|
return '%12s %12s' % (leftStr, rightStr)
|
||||||
|
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
def recalibrateCalls(variants, fields, callCovariates):
|
def recalibrateCalls(variants, fields, callCovariates):
|
||||||
def phred(v): return round(phredScale(v), 2)
|
def phred(v): return phredScale(v)
|
||||||
|
|
||||||
for variant in variants:
|
for variant in variants:
|
||||||
recalCall = RecalibratedCall(variant, fields)
|
recalCall = RecalibratedCall(variant, fields)
|
||||||
|
|
@ -419,9 +427,9 @@ def sensitivitySpecificity(variants, truth):
|
||||||
t = variantInTruth(variant, truth)
|
t = variantInTruth(variant, truth)
|
||||||
|
|
||||||
isTP, isFP = False, False
|
isTP, isFP = False, False
|
||||||
if OPTIONS.useSample:
|
if OPTIONS.useSample or OPTIONS.onlyAtTruth:
|
||||||
if t: # we have a site
|
if t: # we have a site
|
||||||
isTP = isVariantInSample(t, OPTIONS.useSample)
|
isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant())
|
||||||
isFP = not isTP
|
isFP = not isTP
|
||||||
else:
|
else:
|
||||||
isTP = t
|
isTP = t
|
||||||
|
|
@ -439,7 +447,9 @@ def sensitivitySpecificity(variants, truth):
|
||||||
variant.setField("TP", 0)
|
variant.setField("TP", 0)
|
||||||
#print t, variant, "is a FP!"
|
#print t, variant, "is a FP!"
|
||||||
FPs.append(variant)
|
FPs.append(variant)
|
||||||
nFN = variantsInTruth(truth.itervalues()) - nTP
|
nRef = len(filter(lambda x: not x.isVariant(), truth.itervalues()))
|
||||||
|
nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
|
||||||
|
#print 'nRef', nTP, nFP, nFN, nRef
|
||||||
return CallCmp(nTP, nFP, nFN), FPs
|
return CallCmp(nTP, nFP, nFN), FPs
|
||||||
|
|
||||||
def markTruth(calls):
|
def markTruth(calls):
|
||||||
|
|
@ -491,6 +501,9 @@ def setup():
|
||||||
parser.add_option("", "--plottable", dest="plottableNones",
|
parser.add_option("", "--plottable", dest="plottableNones",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
|
help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
|
||||||
|
parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, we only consider TP/FP/FN at truth sites[default: %default]")
|
||||||
parser.add_option("-p", "--partitions", dest="partitions",
|
parser.add_option("-p", "--partitions", dest="partitions",
|
||||||
type='int', default=25,
|
type='int', default=25,
|
||||||
help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]")
|
help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]")
|
||||||
|
|
@ -503,6 +516,9 @@ def setup():
|
||||||
parser.add_option("-M", "--maxRecords", dest="maxRecords",
|
parser.add_option("-M", "--maxRecords", dest="maxRecords",
|
||||||
type='int', default=None,
|
type='int', default=None,
|
||||||
help="Maximum number of input VCF records to process, if provided. Default is all records")
|
help="Maximum number of input VCF records to process, if provided. Default is all records")
|
||||||
|
parser.add_option("-Q", "--qMin", dest="minQScore",
|
||||||
|
type='int', default=-1,
|
||||||
|
help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
|
||||||
parser.add_option("-q", "--qMax", dest="maxQScore",
|
parser.add_option("-q", "--qMax", dest="maxQScore",
|
||||||
type='int', default=60,
|
type='int', default=60,
|
||||||
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
||||||
|
|
@ -536,7 +552,7 @@ def assessCalls(file):
|
||||||
if OPTIONS.maxRecords <> None and OPTIONS.maxRecords < numberOfRecords:
|
if OPTIONS.maxRecords <> None and OPTIONS.maxRecords < numberOfRecords:
|
||||||
numberOfRecords = OPTIONS.maxRecords
|
numberOfRecords = OPTIONS.maxRecords
|
||||||
downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
|
downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
|
||||||
header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction)
|
header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore)
|
||||||
allCalls = list(allCalls)
|
allCalls = list(allCalls)
|
||||||
print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
|
print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
|
||||||
print 'Number of selected VCF records', len(allCalls)
|
print 'Number of selected VCF records', len(allCalls)
|
||||||
|
|
@ -578,7 +594,7 @@ def writeRecalibratedCalls(file, header, calls):
|
||||||
|
|
||||||
def readTruth(truthVCF):
|
def readTruth(truthVCF):
|
||||||
print 'Reading truth file', truthVCF
|
print 'Reading truth file', truthVCF
|
||||||
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True)[1])
|
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1])
|
||||||
truth = dict( [[v.getLoc(), v] for v in rawTruth])
|
truth = dict( [[v.getLoc(), v] for v in rawTruth])
|
||||||
print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
|
print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
|
||||||
return truth
|
return truth
|
||||||
|
|
@ -631,7 +647,7 @@ def main():
|
||||||
header, allCalls, titvTarget = assessCalls(args[0])
|
header, allCalls, titvTarget = assessCalls(args[0])
|
||||||
if not OPTIONS.dontRecalibrate:
|
if not OPTIONS.dontRecalibrate:
|
||||||
covariates = determineCovariates(allCalls, titvTarget, fields)
|
covariates = determineCovariates(allCalls, titvTarget, fields)
|
||||||
header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords)
|
header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore)
|
||||||
RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
|
RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
|
||||||
writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
|
writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
|
||||||
else:
|
else:
|
||||||
|
|
@ -639,7 +655,7 @@ def main():
|
||||||
printCallQuals("QUAL", allCalls, titvTarget)
|
printCallQuals("QUAL", allCalls, titvTarget)
|
||||||
OPTIONS.outputVCF = args[0]
|
OPTIONS.outputVCF = args[0]
|
||||||
|
|
||||||
if len(args) > 1:
|
if truthVCF <> None:
|
||||||
evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
|
evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,11 @@ class VCFRecord:
|
||||||
|
|
||||||
def getRef(self): return self.get("REF")
|
def getRef(self): return self.get("REF")
|
||||||
def getAlt(self): return self.get("ALT")
|
def getAlt(self): return self.get("ALT")
|
||||||
|
def isVariant(self):
|
||||||
|
v = self.getAlt() <> '.'
|
||||||
|
#print 'isVariant', self.bindings, v
|
||||||
|
return v
|
||||||
|
|
||||||
def getQual(self): return self.get("QUAL")
|
def getQual(self): return self.get("QUAL")
|
||||||
|
|
||||||
def getVariation(self): return self.getRef() + self.getAlt()
|
def getVariation(self): return self.getRef() + self.getAlt()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue