gatk-3.8/python/create_venn_evals.py

82 lines
2.6 KiB
Python
Executable File

#!/usr/bin/env python
# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output
import sys, re, os, farm_commands
sting_dir = os.environ.get("STING_DIR")
if sting_dir == None:
print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
sys.exit()
if len(sys.argv) < 2:
print "Usage: PROG CONCORDANCE_FILE"
sys.exit()
class callset_data:
def __init__(self, snps, in_dbSNP, titv):
self.snps = snps
self.in_dbSNP = in_dbSNP
self.titv = titv
def inc_snps(self):
self.snps += 1
def inc_dbsnps(self):
self.in_dbSNP += 1
def __str__(self):
return "SNPs: %10d in_dbSNP: %d TiTv: %d" % (self.snps, self.in_dbSNP, self.titv)
sets = dict()
concordance_filename = os.path.abspath(sys.argv[1])
for line in file(concordance_filename):
fields = line.split("\t")
filter = 0
if not re.search('HybridSelectionVariantFilter', line):
match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
if match:
my_set = match.groups()[0]
if sets.has_key(my_set):
sets[my_set].inc_snps()
if fields[2] != ".":
sets[my_set].inc_dbsnps()
else:
sets[my_set] = callset_data(1, 0, 0)
print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])
print "Splitting concordance file\n"
splits_dir = concordance_filename+".splits"
for vennSet in sets.keys():
print vennSet
output_head = splits_dir+"_"+vennSet
vcf_file = splits_dir+"_"+vennSet+".vcf"
varianteval_file = splits_dir+"_"+vennSet+".varianteval"
if not os.path.exists(varianteval_file):
fout = open(vcf_file, "w")
for line in file(concordance_filename):
if line.startswith("#"):
fout.write(line)
else:
match = re.search('Venn='+vennSet, line)
if match:
fout.write(line)
varianteval_out = os.path.basename(concordance_filename)+".vcf"
farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
+"-T VariantEval "
+"-B eval,VCF,"+vcf_file+" "
+"-o "+varianteval_file+" "
+"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
"gsa")