From 46c39f2d53c21b3bafb4c2263fafa76eeab8dab0 Mon Sep 17 00:00:00 2001 From: chartl Date: Mon, 12 Jul 2010 19:13:32 +0000 Subject: [PATCH] Quick python scripts for going from genotype VCFs to site-only VCFs, and one to fix BC vcf files (which had "het" genotypes at non-variant sites) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3768 348d0f76-0448-11de-a6fe-93d51630548a --- python/setFilterGenotypesToRef.py | 16 +++++++++++++ python/vcfGenotypeToSites.py | 38 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 python/setFilterGenotypesToRef.py create mode 100644 python/vcfGenotypeToSites.py diff --git a/python/setFilterGenotypesToRef.py b/python/setFilterGenotypesToRef.py new file mode 100644 index 000000000..e8944aa61 --- /dev/null +++ b/python/setFilterGenotypesToRef.py @@ -0,0 +1,16 @@ +import sys +print("Fixing "+sys.argv[1]+" to "+sys.argv[2]) +bad_vcf = open(sys.argv[1]) +out_vcf = open(sys.argv[2],'w') + +for line in bad_vcf.readlines(): + if ( line.startswith("#") ): + out_vcf.write(line) + else: + spline = line.strip().split("\t") + newspline = list() + for field in spline: + if ( field.find("pGeno") > -1 ): + field = "0/0:"+field.split(":",1)[1] + newspline.append(field) + out_vcf.write("\t".join(newspline)+"\n") diff --git a/python/vcfGenotypeToSites.py b/python/vcfGenotypeToSites.py new file mode 100644 index 000000000..203d51dfc --- /dev/null +++ b/python/vcfGenotypeToSites.py @@ -0,0 +1,38 @@ +import sys +genotype_vcf = open(sys.argv[1]) +sites_vcf = open(sys.argv[2],'w') + +sample_name = "ALL_HET" +info = "." +format = "GT" +het = "0/1" +use_fields = range(7) + +line_counter = 0 +print("Reading genotype file...") +for line in genotype_vcf.readlines(): + line_counter += 1 + if ( line.startswith("#") and not line.startswith("#CHR") ): + sites_vcf.write(line) + elif ( line.startswith("#CHR") ): + sites_vcf.write("##source=vcfGenotypeToSites\n") + spline = line.strip().split("\t") + newfields = list() + for i in range(9): + newfields.append(spline[i]) + newfields.append(sample_name) + sites_vcf.write("\t".join(newfields)+"\n") + else: + spline = line.strip().split("\t") + newfields = list() + for i in use_fields: + newfields.append(spline[i]) + newfields.append(info) + newfields.append(format) + newfields.append(het) + sites_vcf.write("\t".join(newfields)+"\n") + if ( line_counter % 100000 == 0 ): + print("Converted: "+str(line_counter)+" lines") + +genotype_vcf.close() +sites_vcf.close()