minor improvements to snpSelector to work with hapmap chip VCF files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2343 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-13 17:59:32 +00:00 · 2009-12-13 17:59:32 +00:00 · 56467df49a
parent 45199136f0
commit 56467df49a
2 changed files with 34 additions and 13 deletions
--- a/python/snpSelector.py
+++ b/python/snpSelector.py
@ -67,24 +67,32 @@ class RecalibratedCall:
    def __str__(self):
        return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))

-def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None ):
+def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, mustBeVariant = False ):
    if filter == None:
        filter = not OPTIONS.unfiltered
        
    f = open(file)
    header, columnNames, lines = readVCFHeader(f)

+    nLowQual = 0
    def parseVariant(args):
+        global nLowQual
        header1, VCF, counter = args
-        if filter and not VCF.passesFilters():
+        if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
            #print 'filtering', VCF
            return None
+        elif VCF.getQual() <= minQScore:
+            #print 'filtering', VCF
+            #nLowQual += 1
+            return None
        elif random.random() <= downsampleFraction:
            return VCF
        else:
            return None

    variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
+    if nLowQual > 0:
+        print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore)
    return header, variants

 def selectVariants( variants, selector = None ):
@ -286,15 +294,15 @@ def binString(field, left, right):
        left = right = FIELD_RANGES[field] - epsilon
    if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
        left = right - epsilon        
-    if isNumber(left): leftStr = "%.2f" % left
-    if isNumber(right): rightStr = "%.2f" % right
-    return '%8s %8s' % (leftStr, rightStr)
+    if isNumber(left): leftStr = "%.4f" % left
+    if isNumber(right): rightStr = "%.4f" % right
+    return '%12s %12s' % (leftStr, rightStr)

 #
 #
 #
 def recalibrateCalls(variants, fields, callCovariates):
-    def phred(v): return round(phredScale(v), 2)
+    def phred(v): return phredScale(v)
    
    for variant in variants:
        recalCall = RecalibratedCall(variant, fields) 
@ -419,9 +427,9 @@ def sensitivitySpecificity(variants, truth):
        t = variantInTruth(variant, truth)

        isTP, isFP = False, False
-        if OPTIONS.useSample:
+        if OPTIONS.useSample or OPTIONS.onlyAtTruth:
            if t: # we have a site
-                isTP = isVariantInSample(t, OPTIONS.useSample)
+                isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant())
                isFP = not isTP
        else:
            isTP = t
@ -439,7 +447,9 @@ def sensitivitySpecificity(variants, truth):
            variant.setField("TP", 0)
            #print t, variant, "is a FP!"
            FPs.append(variant)
-    nFN = variantsInTruth(truth.itervalues()) - nTP
+    nRef = len(filter(lambda x: not x.isVariant(), truth.itervalues()))
+    nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
+    #print 'nRef', nTP, nFP, nFN, nRef
    return CallCmp(nTP, nFP, nFN), FPs

 def markTruth(calls):
@ -491,6 +501,9 @@ def setup():
    parser.add_option("", "--plottable", dest="plottableNones",
                        action='store_true', default=False,
                        help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
+    parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth",
+                        action='store_true', default=False,
+                        help="If provided, we only consider TP/FP/FN at truth sites[default: %default]")
    parser.add_option("-p", "--partitions", dest="partitions",
                        type='int', default=25,
                        help="Number of partitions to use for each feature.  Don't use so many that the number of variants per bin is very low. [default: %default]")
@ -503,6 +516,9 @@ def setup():
    parser.add_option("-M", "--maxRecords", dest="maxRecords",
                       type='int', default=None,
                        help="Maximum number of input VCF records to process, if provided.  Default is all records")
+    parser.add_option("-Q", "--qMin", dest="minQScore",
+                        type='int', default=-1,
+                        help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
    parser.add_option("-q", "--qMax", dest="maxQScore",
                        type='int', default=60,
                        help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
@ -536,7 +552,7 @@ def assessCalls(file):
    if OPTIONS.maxRecords <> None and OPTIONS.maxRecords < numberOfRecords:
        numberOfRecords = OPTIONS.maxRecords
    downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
-    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction)
+    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore)
    allCalls = list(allCalls)
    print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
    print 'Number of selected VCF records', len(allCalls)
@ -578,7 +594,7 @@ def writeRecalibratedCalls(file, header, calls):

 def readTruth(truthVCF):
    print 'Reading truth file', truthVCF
-    rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True)[1])
+    rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1])
    truth = dict( [[v.getLoc(), v] for v in rawTruth])
    print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
    return truth
@ -631,7 +647,7 @@ def main():
    header, allCalls, titvTarget = assessCalls(args[0])
    if not OPTIONS.dontRecalibrate: 
        covariates = determineCovariates(allCalls, titvTarget, fields)
-        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords)
+        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore)
        RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
        writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
    else:
@ -639,7 +655,7 @@ def main():
        printCallQuals("QUAL", allCalls, titvTarget)
        OPTIONS.outputVCF = args[0]

-    if len(args) > 1:
+    if truthVCF <> None:
        evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)


--- a/python/vcfReader.py
+++ b/python/vcfReader.py
@ -56,6 +56,11 @@ class VCFRecord:
    
    def getRef(self): return self.get("REF")
    def getAlt(self): return self.get("ALT")
+    def isVariant(self): 
+        v = self.getAlt() <> '.'
+        #print 'isVariant', self.bindings, v
+        return v
+
    def getQual(self): return self.get("QUAL")
    
    def getVariation(self): return self.getRef() + self.getAlt()