From 196bca6819efb17f6084507a99dfb73f3e1177ed Mon Sep 17 00:00:00 2001
From: andrewk <andrewk@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 12 Mar 2010 22:20:44 +0000
Subject: [PATCH] Script to split concordance files into their constituent sets
 and calculate summary stats from a concordance file - SNPs called and number
 in dbSNP

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2992 348d0f76-0448-11de-a6fe-93d51630548a
---
 python/create_venn_evals.py | 81 +++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100755 python/create_venn_evals.py

diff --git a/python/create_venn_evals.py b/python/create_venn_evals.py
new file mode 100755
index 000000000..e5bc1bad1
--- /dev/null
+++ b/python/create_venn_evals.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output 
+
+import sys, re, os, farm_commands
+
+sting_dir = os.environ.get("STING_DIR")
+if sting_dir == None:
+    print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
+    sys.exit()
+    
+if len(sys.argv) < 2:
+    print "Usage: PROG CONCORDANCE_FILE"
+    sys.exit()
+
+class callset_data:
+    def __init__(self, snps, in_dbSNP, titv):
+        self.snps = snps
+        self.in_dbSNP = in_dbSNP
+        self.titv = titv
+    def inc_snps(self):
+        self.snps += 1
+    def inc_dbsnps(self):
+        self.in_dbSNP += 1
+    def __str__(self):
+        return "SNPs: %10d  in_dbSNP: %d  TiTv: %d" % (self.snps, self.in_dbSNP, self.titv)
+
+sets = dict()
+concordance_filename = os.path.abspath(sys.argv[1])
+for line in file(concordance_filename):
+    fields = line.split("\t")
+    filter = 0
+    if not re.search('HybridSelectionVariantFilter', line):
+        match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
+        if match:
+            my_set = match.groups()[0]
+            
+            if sets.has_key(my_set):
+                sets[my_set].inc_snps()
+                if fields[2] != ".":
+                    sets[my_set].inc_dbsnps()
+            else:
+                sets[my_set] = callset_data(1, 0, 0)
+
+print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])
+
+print "Splitting concordance file\n"
+splits_dir = concordance_filename+".splits"
+
+for vennSet in sets.keys():
+    print vennSet
+    output_head = splits_dir+"_"+vennSet
+    vcf_file = splits_dir+"_"+vennSet+".vcf"
+    varianteval_file = splits_dir+"_"+vennSet+".varianteval"
+
+    if not os.path.exists(varianteval_file):
+        fout = open(vcf_file, "w")
+        for line in file(concordance_filename):    
+            if line.startswith("#"):
+                fout.write(line)
+            else:
+                match = re.search('Venn='+vennSet, line)
+                if match:
+                    fout.write(line)
+                    
+        varianteval_out = os.path.basename(concordance_filename)+".vcf"
+        farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
+                  +"-T VariantEval "
+                  +"-B eval,VCF,"+vcf_file+" "
+                  +"-o "+varianteval_file+" "
+                  +"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
+                  "gsa")
+
+
+
+
+
+
+
+
+