A quick and dirty script that, given a list of input VCF files, will output a new VCF file which looks identical to the first VCF file of the input list, except that the info field has been updated to reflect the union of all the INFO annotations across the VCF files

Note: this is primarily for use with two files with mostly disjoint annotations. It views "SB=2.5" as a different info field than "SB=2.2" and so will output as info "SB=2.5;SB=2.2". That is, it compares the full field string, rather than only the field name.

Usage:

./mergeVCFInfoFields I=[comma-delimited list of files] O=[output file]



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2176 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-11-29 20:01:29 +00:00
parent 27651d8dc2
commit 45673d7851
1 changed files with 97 additions and 0 deletions

View File

@ -0,0 +1,97 @@
#!/usr/bin/env Python
import sys
debug = False
print("Merging:")
# open files for reading
vcfInputFiles = []
vcfOutputFile = ""
for arg in sys.argv:
if( arg == "./mergeVCFInfoFields.py" ):
# do nothing
continue
else:
if ( arg.startswith("I=") ):
input1 = arg.strip().split("I=")[1]
input2 = input1.split(",")
for vcfInput in input2:
print(vcfInput)
vcfInputFiles.append(vcfInput)
elif ( arg.startswith("O=") ):
vcfOutputFile=open(arg.strip().split("O=")[1],'w')
else:
print("Unsupported argument: "+arg)
sys.exit()
lines = 0
# the proper (albeit slower) way to do this is to read the info fields
# into a dict and then iterate through the keys
infodict = dict()
for inFile in vcfInputFiles:
# for each file
for line in open(inFile).readlines():
# for each line
if ( line.startswith("#") ):
# do nothing
continue
else:
spline = line.strip().split()
chrompos = spline[0]+":"+spline[1]
info = spline[7]
if ( chrompos in infodict ):
if ( info == "." ):
# do nothing
continue
else:
curInfo = infodict[chrompos]
if ( curInfo == "." or curInfo == ""):
newInfo = info
else:
# now we need to parse the fields
curinfofields = set(curInfo.split(";"))
newinfofields = set(newInfo.split(";"))
curinfofields.update(newinfofields)
newInfo = ";".join(curinfofields)
infodict[chrompos] = newInfo
#print(newInfo)
else:
infodict[chrompos] = info
# dictionary has been constructed; now just iterate through the first vcf
# and update the info field, printing to the output file
if ( debug ):
for key in infodict:
print(infodict[key])
for line in open(vcfInputFiles[0]).readlines():
line = line.strip()
if ( line.startswith("#") ):
# this is a header
vcfOutputFile.write(line+"\n")
else:
# this is a real line
spline = line.split()
outstring = ""
fieldno = 0
for field in spline:
#print("fieldno="+str(fieldno))
if ( fieldno == 7 ):
outstring = outstring+infodict[spline[0]+":"+spline[1]]+"\t"
fieldno = fieldno + 1
else:
outstring = outstring+field+"\t"
fieldno = fieldno + 1
# we wrote an extra \t, replace the last one with an \n
outstring = outstring.strip()+"\n"
vcfOutputFile.write(outstring)
# and we're done