82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
|
||
|
|
# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output
|
||
|
|
|
||
|
|
import sys, re, os, farm_commands
|
||
|
|
|
||
|
|
sting_dir = os.environ.get("STING_DIR")
|
||
|
|
if sting_dir == None:
|
||
|
|
print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
|
||
|
|
sys.exit()
|
||
|
|
|
||
|
|
if len(sys.argv) < 2:
|
||
|
|
print "Usage: PROG CONCORDANCE_FILE"
|
||
|
|
sys.exit()
|
||
|
|
|
||
|
|
class callset_data:
|
||
|
|
def __init__(self, snps, in_dbSNP, titv):
|
||
|
|
self.snps = snps
|
||
|
|
self.in_dbSNP = in_dbSNP
|
||
|
|
self.titv = titv
|
||
|
|
def inc_snps(self):
|
||
|
|
self.snps += 1
|
||
|
|
def inc_dbsnps(self):
|
||
|
|
self.in_dbSNP += 1
|
||
|
|
def __str__(self):
|
||
|
|
return "SNPs: %10d in_dbSNP: %d TiTv: %d" % (self.snps, self.in_dbSNP, self.titv)
|
||
|
|
|
||
|
|
sets = dict()
|
||
|
|
concordance_filename = os.path.abspath(sys.argv[1])
|
||
|
|
for line in file(concordance_filename):
|
||
|
|
fields = line.split("\t")
|
||
|
|
filter = 0
|
||
|
|
if not re.search('HybridSelectionVariantFilter', line):
|
||
|
|
match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
|
||
|
|
if match:
|
||
|
|
my_set = match.groups()[0]
|
||
|
|
|
||
|
|
if sets.has_key(my_set):
|
||
|
|
sets[my_set].inc_snps()
|
||
|
|
if fields[2] != ".":
|
||
|
|
sets[my_set].inc_dbsnps()
|
||
|
|
else:
|
||
|
|
sets[my_set] = callset_data(1, 0, 0)
|
||
|
|
|
||
|
|
print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])
|
||
|
|
|
||
|
|
print "Splitting concordance file\n"
|
||
|
|
splits_dir = concordance_filename+".splits"
|
||
|
|
|
||
|
|
for vennSet in sets.keys():
|
||
|
|
print vennSet
|
||
|
|
output_head = splits_dir+"_"+vennSet
|
||
|
|
vcf_file = splits_dir+"_"+vennSet+".vcf"
|
||
|
|
varianteval_file = splits_dir+"_"+vennSet+".varianteval"
|
||
|
|
|
||
|
|
if not os.path.exists(varianteval_file):
|
||
|
|
fout = open(vcf_file, "w")
|
||
|
|
for line in file(concordance_filename):
|
||
|
|
if line.startswith("#"):
|
||
|
|
fout.write(line)
|
||
|
|
else:
|
||
|
|
match = re.search('Venn='+vennSet, line)
|
||
|
|
if match:
|
||
|
|
fout.write(line)
|
||
|
|
|
||
|
|
varianteval_out = os.path.basename(concordance_filename)+".vcf"
|
||
|
|
farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
|
||
|
|
+"-T VariantEval "
|
||
|
|
+"-B eval,VCF,"+vcf_file+" "
|
||
|
|
+"-o "+varianteval_file+" "
|
||
|
|
+"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
|
||
|
|
"gsa")
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|