A quick and dirty script that, given a list of input VCF files, will output a new VCF file which looks identical to the first VCF file of the input list, except that the info field has been updated to reflect the union of all the INFO annotations across the VCF files

Note: this is primarily for use with two files with mostly disjoint annotations. It views "SB=2.5" as a different info field than "SB=2.2" and so will output as info "SB=2.5;SB=2.2". That is, it compares the full field string, rather than only the field name. Usage: ./mergeVCFInfoFields I=[comma-delimited list of files] O=[output file] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2176 348d0f76-0448-11de-a6fe-93d51630548a
2009-11-29 20:01:29 +00:00 · 2009-11-29 20:01:29 +00:00 · 45673d7851
parent 27651d8dc2
commit 45673d7851
1 changed files with 97 additions and 0 deletions
--- a/python/mergeVCFInfoFields.py
+++ b/python/mergeVCFInfoFields.py
@ -0,0 +1,97 @@
+#!/usr/bin/env Python
+
+import sys
+
+debug = False
+
+print("Merging:")
+
+# open files for reading
+vcfInputFiles = []
+vcfOutputFile = ""
+for arg in sys.argv:
+    if( arg == "./mergeVCFInfoFields.py" ):
+        # do nothing
+        continue
+    else:
+        if ( arg.startswith("I=") ):
+            input1 = arg.strip().split("I=")[1]
+            input2 = input1.split(",")
+            for vcfInput in input2:
+                print(vcfInput)
+                vcfInputFiles.append(vcfInput)
+        elif ( arg.startswith("O=") ):
+            vcfOutputFile=open(arg.strip().split("O=")[1],'w')
+        else:
+            print("Unsupported argument: "+arg)
+            sys.exit()
+
+lines = 0
+
+# the proper (albeit slower) way to do this is to read the info fields
+# into a dict and then iterate through the keys
+
+infodict = dict()
+
+for inFile in vcfInputFiles:
+    # for each file
+    for line in open(inFile).readlines():
+        # for each line
+        if ( line.startswith("#") ):
+            # do nothing
+            continue
+        else:
+            spline = line.strip().split()
+            chrompos = spline[0]+":"+spline[1]
+            info = spline[7]
+            if ( chrompos in infodict ):
+                if ( info == "." ):
+                    # do nothing
+                    continue
+                else:
+                    curInfo = infodict[chrompos]
+                    if ( curInfo == "." or curInfo == ""):
+                        newInfo = info
+                    else:
+                        # now we need to parse the fields
+                        curinfofields = set(curInfo.split(";"))
+                        newinfofields = set(newInfo.split(";"))
+                        curinfofields.update(newinfofields)
+                        newInfo = ";".join(curinfofields)
+                    
+                    infodict[chrompos] = newInfo
+                    #print(newInfo)
+            else:
+                infodict[chrompos] = info
+
+# dictionary has been constructed; now just iterate through the first vcf
+# and update the info field, printing to the output file
+
+if ( debug ):
+    for key in infodict:
+        print(infodict[key])
+
+for line in open(vcfInputFiles[0]).readlines():
+    line = line.strip()
+    if ( line.startswith("#") ):
+        # this is a header
+        vcfOutputFile.write(line+"\n")
+    else:
+        # this is a real line
+        spline = line.split()
+        outstring = ""
+        fieldno = 0
+        for field in spline:
+            #print("fieldno="+str(fieldno))
+            if ( fieldno == 7 ):
+                outstring = outstring+infodict[spline[0]+":"+spline[1]]+"\t"
+                fieldno = fieldno + 1
+            else:
+                outstring = outstring+field+"\t"
+                fieldno = fieldno + 1
+        # we wrote an extra \t, replace the last one with an \n
+        outstring = outstring.strip()+"\n"
+        vcfOutputFile.write(outstring)
+
+
+# and we're done