Quick python scripts for going from genotype VCFs to site-only VCFs, and one to fix BC vcf files (which had "het" genotypes at non-variant sites)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3768 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-07-12 19:13:32 +00:00
parent e50627a49e
commit 46c39f2d53
2 changed files with 54 additions and 0 deletions

View File

@ -0,0 +1,16 @@
import sys
print("Fixing "+sys.argv[1]+" to "+sys.argv[2])
bad_vcf = open(sys.argv[1])
out_vcf = open(sys.argv[2],'w')
for line in bad_vcf.readlines():
if ( line.startswith("#") ):
out_vcf.write(line)
else:
spline = line.strip().split("\t")
newspline = list()
for field in spline:
if ( field.find("pGeno") > -1 ):
field = "0/0:"+field.split(":",1)[1]
newspline.append(field)
out_vcf.write("\t".join(newspline)+"\n")

View File

@ -0,0 +1,38 @@
import sys
genotype_vcf = open(sys.argv[1])
sites_vcf = open(sys.argv[2],'w')
sample_name = "ALL_HET"
info = "."
format = "GT"
het = "0/1"
use_fields = range(7)
line_counter = 0
print("Reading genotype file...")
for line in genotype_vcf.readlines():
line_counter += 1
if ( line.startswith("#") and not line.startswith("#CHR") ):
sites_vcf.write(line)
elif ( line.startswith("#CHR") ):
sites_vcf.write("##source=vcfGenotypeToSites\n")
spline = line.strip().split("\t")
newfields = list()
for i in range(9):
newfields.append(spline[i])
newfields.append(sample_name)
sites_vcf.write("\t".join(newfields)+"\n")
else:
spline = line.strip().split("\t")
newfields = list()
for i in use_fields:
newfields.append(spline[i])
newfields.append(info)
newfields.append(format)
newfields.append(het)
sites_vcf.write("\t".join(newfields)+"\n")
if ( line_counter % 100000 == 0 ):
print("Converted: "+str(line_counter)+" lines")
genotype_vcf.close()
sites_vcf.close()