minor cleanup

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2616 348d0f76-0448-11de-a6fe-93d51630548a
2010-01-16 20:23:20 +00:00 · 2010-01-16 20:23:20 +00:00 · 8226f4aa12
parent d8e74c5795
commit 8226f4aa12
1 changed files with 23 additions and 9 deletions
--- a/python/snpSelector.py
+++ b/python/snpSelector.py
@ -91,7 +91,7 @@ class RecalibratedCall:
    def __str__(self):
        return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))

-def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, mustBeVariant = False ):
+def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, mustBeVariant = False, skip = None ):
    if filter == None:
        filter = not OPTIONS.unfiltered
        
@ -99,9 +99,12 @@ def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction
    header, columnNames, lines = readVCFHeader(f)

    nLowQual = 0
+    counter = 0
    def parseVariant(args):
-        global nLowQual
+        global nLowQual, counter
        header1, VCF, counter = args
+        counter += 1
+        #print counter, skip, counter % skip
        if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
            #print 'filtering', VCF
            return None
@ -109,6 +112,9 @@ def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction
            #print 'filtering', VCF
            #nLowQual += 1
            return None
+        elif skip <> None and counter % skip <> 0:
+            #print 'skipping'
+            return None
        elif random.random() <= downsampleFraction:
            return VCF
        else:
@ -481,7 +487,7 @@ def sensitivitySpecificity(variants, truth):
            variant.setField("TP", 0)
            #print t, variant, "is a FP!"
            FPs.append(variant)
-    nRef = len(filter(lambda x: not x.isVariant(), truth.itervalues()))
+    nRef = 0 # len(filter(lambda x: not x.isVariant(), truth.itervalues()))
    nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
    #print 'nRef', nTP, nFP, nFN, nRef
    return CallCmp(nTP, nFP, nFN), FPs
@ -571,6 +577,9 @@ def setup():
    parser.add_option("-b", "--bootstrap", dest="bootStrap",
                       type='float', default=None,
                       help="If provided, the % of the calls used to generate the recalibration tables. [default: %default]")
+    parser.add_option("-k", "--skip", dest="skip",
+                       type=int, default=None,
+                       help="If provided, we'll only take every nth record [default: %default]")
    parser.add_option("-s", "--useSample", dest="useSample",
                        type='string', default=False,
                        help="If provided, we will examine sample genotypes for this sample, and consider TP/FP/FN in the truth conditional on sample genotypes [default: %default]")
@ -585,11 +594,15 @@ def setup():

 def assessCalls(file):
    print 'Counting records in VCF', file
-    numberOfRecords = quickCountRecords(open(file))
-    if OPTIONS.maxRecords <> None and OPTIONS.maxRecords < numberOfRecords:
-        numberOfRecords = OPTIONS.maxRecords
-    downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
-    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore)
+    numberOfRecords = 1
+    downsampleFraction = 1
+    if OPTIONS.maxRecords <> None:
+        numberOfRecords = quickCountRecords(open(file))
+        if OPTIONS.maxRecords < numberOfRecords:
+            numberOfRecords = OPTIONS.maxRecords
+        downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
+    #print 'Reading variants', OPTIONS.skip, downsampleFraction
+    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore, skip = OPTIONS.skip)
    allCalls = list(allCalls)
    print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
    print 'Number of selected VCF records', len(allCalls)
@ -684,7 +697,8 @@ def main():
    header, allCalls, titvTarget = assessCalls(args[0])
    if not OPTIONS.dontRecalibrate: 
        covariates = determineCovariates(allCalls, titvTarget, fields)
-        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore)
+        print OPTIONS
+        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore, skip = OPTIONS.skip)
        RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
        writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
    else: