gatk-3.8/python/vcfGenotypeToSites.py

39 lines
1.1 KiB
Python
Raw Normal View History

import sys
# Convert a genotype VCF into a single-sample "sites" VCF.
#
# Usage: vcfGenotypeToSites.py <genotypes.vcf> <sites.vcf>
#
# Meta-information lines ("##...") are copied unchanged.  The column header
# line ("#CHROM...") is cut down to its first nine columns followed by one
# sample column named ALL_HET.  Every other line keeps its first seven
# columns and gets a fixed INFO value, FORMAT value and genotype appended.

SAMPLE_NAME = "ALL_HET"
INFO_VALUE = "."
FORMAT_VALUE = "GT"  # kept out of the name `format` so the builtin is not shadowed
HET_GENOTYPE = "0/1"
HEADER_FIELD_COUNT = 9  # leading columns copied from the "#CHROM" header line
SITE_FIELD_COUNT = 7    # leading columns copied from each record line
PROGRESS_INTERVAL = 100000


def _leading_fields(line, count, line_number):
    """Return the first `count` tab-separated fields of `line`.

    Surrounding whitespace is stripped before splitting.

    Raises:
        IndexError: if the line has fewer than `count` fields.  This is the
            exception type plain indexing would raise; the message adds the
            offending line number.
    """
    fields = line.strip().split("\t")
    if len(fields) < count:
        raise IndexError(
            "line %d: expected at least %d tab-separated fields, found %d"
            % (line_number, count, len(fields))
        )
    return fields[:count]


def convert_genotypes_to_sites(genotype_vcf, sites_vcf):
    """Stream lines from `genotype_vcf` and write the sites form to `sites_vcf`.

    Args:
        genotype_vcf: iterable of text lines (e.g. an open file).
        sites_vcf: object with a ``write(str)`` method.

    Returns:
        The number of input lines read.

    Raises:
        IndexError: if the header line has fewer than nine fields or a
            record line has fewer than seven.
    """
    line_counter = 0
    # Iterate the file directly so the whole input is never held in memory.
    for line_counter, line in enumerate(genotype_vcf, 1):
        if line.startswith("#CHR"):
            sites_vcf.write("##source=vcfGenotypeToSites\n")
            columns = _leading_fields(line, HEADER_FIELD_COUNT, line_counter)
            columns.append(SAMPLE_NAME)
            sites_vcf.write("\t".join(columns) + "\n")
        elif line.startswith("#"):
            # Any other "#" line is passed through untouched.
            sites_vcf.write(line)
        else:
            columns = _leading_fields(line, SITE_FIELD_COUNT, line_counter)
            columns.extend((INFO_VALUE, FORMAT_VALUE, HET_GENOTYPE))
            sites_vcf.write("\t".join(columns) + "\n")
        if line_counter % PROGRESS_INTERVAL == 0:
            print("Converted: " + str(line_counter) + " lines")
    return line_counter


def main(argv=None):
    """Run the conversion for ``argv = [input_path, output_path]``.

    When `argv` is None the arguments are taken from ``sys.argv[1:]``.
    Arguments beyond the first two are ignored.  Exits with a usage message
    if fewer than two arguments are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: vcfGenotypeToSites.py <genotypes.vcf> <sites.vcf>")
    # `with` closes both files even if a malformed line aborts the run.
    with open(args[0]) as genotype_vcf, open(args[1], "w") as sites_vcf:
        print("Reading genotype file...")
        convert_genotypes_to_sites(genotype_vcf, sites_vcf)


if __name__ == "__main__":
    main()