From 3ac5ac066f2eca15c9feddcf747b1f54408688d7 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 3 Sep 2009 15:07:49 +0000 Subject: [PATCH] Checking in Michael's DoC parameterization script; this functionality will eventually be moved into VariantFiltration git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1515 348d0f76-0448-11de-a6fe-93d51630548a --- python/DOCParameter.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100755 python/DOCParameter.py diff --git a/python/DOCParameter.py b/python/DOCParameter.py new file mode 100755 index 000000000..b25a7cfca --- /dev/null +++ b/python/DOCParameter.py @@ -0,0 +1,47 @@ +#!/util/bin/python + +import math, string, sys + +for input_file in sys.argv[1:]: + + FILENAME = input_file + + # Read the depth of coverage distribution from file + depth = []; count = [] + for line in open(FILENAME): + try: + int(line[0]) + split = string.split(line,' ') + if int(split[1]) != 0: + depth.append(int(split[0])) + count.append(int(split[1])) + except ValueError: pass + + # Calculate the mode + mode = depth[count.index(max(count))] + + # Find the index of maximum extent of 'good' data + idx = depth.index(mode); dist = 10**9 + while abs(count[idx] - count[0]) < dist: + dist = abs(count[idx] - count[0]) + idx += 1 + model_count = count[:idx + 1] + + # Calculate mean & variance of 'good' data + model_tot = sum(model_count); adder = 0 + for i in range(len(model_count)): adder += depth[i]*model_count[i] + mean = adder/float(model_tot) + + adder = 0 + for i in range(len(model_count)): adder += model_count[i]*(mean - depth[i])**2 + var = adder/float(model_tot) + stdev = var**0.5 + + # Output Thresholds + print FILENAME + print 'Mean Depth of Coverage: ' + str(mean) + print 'Plus One Std Dev: ' + str(mean + stdev) + print 'Plus Two Std Dev: ' + str(mean + 2*stdev) + print 'Plus Three Std Dev: ' + str(mean + 3*stdev) + print 'Plus Four Std Dev: ' + str(mean + 4*stdev) + print 'Plus Five Std Dev: ' + str(mean + 5*stdev)