From 2632cb6b585869fd0dfc033f3e0c668d11403a41 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 7 Dec 2009 03:37:14 +0000 Subject: [PATCH] minor improvements to snp selector git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2275 348d0f76-0448-11de-a6fe-93d51630548a --- python/snpSelector.py | 19 +++++++++++-------- python/vcfReader.py | 11 +++++++---- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/snpSelector.py b/python/snpSelector.py index 045f81e8b..16f230e04 100755 --- a/python/snpSelector.py +++ b/python/snpSelector.py @@ -67,13 +67,19 @@ class RecalibratedCall: def __str__(self): return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate())) -def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1 ): +def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None ): + if filter == None: + filter = not OPTIONS.unfiltered + f = open(file) header, columnNames, lines = readVCFHeader(f) def parseVariant(args): header1, VCF, counter = args - if random.random() <= downsampleFraction: + if filter and not VCF.passesFilters(): + #print 'filtering', VCF + return None + elif random.random() <= downsampleFraction: return VCF else: return None @@ -479,9 +485,9 @@ def setup(): parser.add_option("-l", "--recalLog", dest="recalLog", type='string', default="recal.log", help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls") - parser.add_option("", "--unFilteredTruth", dest="unFilteredTruth", + parser.add_option("-u", "--unfiltered", dest="unfiltered", action='store_true', default=False, - help="If provided, the unfiltered truth calls will be used in comparisons [default: %default]") + help="If provided, unfiltered calls will be used in comparisons [default: %default]") parser.add_option("", "--plottable", dest="plottableNones", action='store_true', default=False, help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]") @@ -573,10 +579,7 @@ def writeRecalibratedCalls(file, header, calls): def readTruth(truthVCF): print 'Reading truth file', truthVCF rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True)[1]) - def keepVariant(t): - #print t.getPos(), t.getLoc() - return OPTIONS.unFilteredTruth or t.passesFilters() - truth = dict( [[v.getLoc(), v] for v in filter(keepVariant, rawTruth)]) + truth = dict( [[v.getLoc(), v] for v in rawTruth]) print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth) return truth diff --git a/python/vcfReader.py b/python/vcfReader.py index 5075f517f..f345fe761 100755 --- a/python/vcfReader.py +++ b/python/vcfReader.py @@ -5,8 +5,9 @@ VCF_KEYS_FORMAT = "CHROM POS ID REF ALT QUAL FILTER".split() TRANSITIONS = dict() for p in ["AG", "CT"]: - TRANSITIONS[p] = True - TRANSITIONS[''.join(reversed(p))] = True + for x in [p, ''.join(reversed(p))]: + TRANSITIONS[x.lower()] = True + TRANSITIONS[x] = True def is_nan(x): return type(x) is float and x != x @@ -68,8 +69,10 @@ class VCFRecord: def getFilter(self): return self.get("FILTER") def failsFilters(self): return not self.passesFilters() def passesFilters(self): - #print self.getFilter(), ">>>", self - return self.getFilter() == "." or self.getFilter() == "0" + v = self.getFilter() == "." or self.getFilter() == "0" + #print self.getFilter(), ">>>", v, self + return v + def hasField(self, field): return field in self.bindings or field in self.info