From 45673d78515533a3e6f782a85f8c83beb27b65aa Mon Sep 17 00:00:00 2001 From: chartl Date: Sun, 29 Nov 2009 20:01:29 +0000 Subject: [PATCH] A quick and dirty script that, given a list of input VCF files, will output a new VCF file which looks identical to the first VCF file of the input list, except that the info field has been updated to reflect the union of all the INFO annotations across the VCF files Note: this is primarily for use with two files with mostly disjoint annotations. It views "SB=2.5" as a different info field than "SB=2.2" and so will output as info "SB=2.5;SB=2.2". That is, it compares the full field string, rather than only the field name. Usage: ./mergeVCFInfoFields I=[comma-delimited list of files] O=[output file] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2176 348d0f76-0448-11de-a6fe-93d51630548a --- python/mergeVCFInfoFields.py | 97 ++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 python/mergeVCFInfoFields.py diff --git a/python/mergeVCFInfoFields.py b/python/mergeVCFInfoFields.py new file mode 100644 index 000000000..0a886b51f --- /dev/null +++ b/python/mergeVCFInfoFields.py @@ -0,0 +1,97 @@ +#!/usr/bin/env Python + +import sys + +debug = False + +print("Merging:") + +# open files for reading +vcfInputFiles = [] +vcfOutputFile = "" +for arg in sys.argv: + if( arg == "./mergeVCFInfoFields.py" ): + # do nothing + continue + else: + if ( arg.startswith("I=") ): + input1 = arg.strip().split("I=")[1] + input2 = input1.split(",") + for vcfInput in input2: + print(vcfInput) + vcfInputFiles.append(vcfInput) + elif ( arg.startswith("O=") ): + vcfOutputFile=open(arg.strip().split("O=")[1],'w') + else: + print("Unsupported argument: "+arg) + sys.exit() + +lines = 0 + +# the proper (albeit slower) way to do this is to read the info fields +# into a dict and then iterate through the keys + +infodict = dict() + +for inFile in vcfInputFiles: + # for each file + for line in open(inFile).readlines(): + # for each line + if ( line.startswith("#") ): + # do nothing + continue + else: + spline = line.strip().split() + chrompos = spline[0]+":"+spline[1] + info = spline[7] + if ( chrompos in infodict ): + if ( info == "." ): + # do nothing + continue + else: + curInfo = infodict[chrompos] + if ( curInfo == "." or curInfo == ""): + newInfo = info + else: + # now we need to parse the fields + curinfofields = set(curInfo.split(";")) + newinfofields = set(newInfo.split(";")) + curinfofields.update(newinfofields) + newInfo = ";".join(curinfofields) + + infodict[chrompos] = newInfo + #print(newInfo) + else: + infodict[chrompos] = info + +# dictionary has been constructed; now just iterate through the first vcf +# and update the info field, printing to the output file + +if ( debug ): + for key in infodict: + print(infodict[key]) + +for line in open(vcfInputFiles[0]).readlines(): + line = line.strip() + if ( line.startswith("#") ): + # this is a header + vcfOutputFile.write(line+"\n") + else: + # this is a real line + spline = line.split() + outstring = "" + fieldno = 0 + for field in spline: + #print("fieldno="+str(fieldno)) + if ( fieldno == 7 ): + outstring = outstring+infodict[spline[0]+":"+spline[1]]+"\t" + fieldno = fieldno + 1 + else: + outstring = outstring+field+"\t" + fieldno = fieldno + 1 + # we wrote an extra \t, replace the last one with an \n + outstring = outstring.strip()+"\n" + vcfOutputFile.write(outstring) + + +# and we're done