gatk-3.8/python/createCaseControlMetaData.py

62 lines
2.3 KiB
Python
Raw Normal View History

import sys
import os
import subprocess
import shlex
from optparse import OptionParser
def parseInput(fList,ignoreExt = None):
inNames = []
for ele in fList:
if ( ignoreExt != None and ele.endswith(ignoreExt) ):
inFileNames.append(ele)
if ( os.path.exists(ele) ):
for line in open(ele).readlines():
inNames.append(line.strip())
return inNames
def bamsWithSamples(bamList):
cmdbase = "samtools view -H %s | grep SM | tr '\\t' '\\n' | grep SM | sed 's/SM://g' | uniq"
sam2bam = dict()
for bf in bamList:
if ( not os.path.exists(bf) ):
raise IOError("Bam file "+bf+" does not exist")
cmd = cmdbase % bf
proc = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
proc.wait()
stdout = proc.stdout.readlines()
if ( len(stdout) > 1 ):
raise RuntimeError("The bam file "+bf+" contains multiple different sample entries")
sm = stdout[0].strip()
sam2bam[sm]=bf
return sam2bam
def runMain(opt,arg):
bamFiles = bamsWithSamples(parseInput(opt.bam_files,"bam"))
caseNames = set(parseInput(opt.cases))
controlNames = set(parseInput(opt.controls))
output = open(opt.output,'w')
output.write("samples:")
sample_base = "- id: %s\n properties:\n cohort: %s\n bam: %s"
for s in bamFiles.keys():
cc = "Unknown"
if ( s in caseNames ):
cc = "case"
if ( s in controlNames ):
cc = "control"
output.write("\n" + sample_base % (s,cc,bamFiles[s]))
output.close()
def main():
usage = "usage: %prog [options] arg"
parser = OptionParser(usage)
parser.add_option("-I","--bams",dest="bam_files",help="the bam files, as multiple arguments or a simple newline-delimited file",action="append")
parser.add_option("-A","--case",dest="cases",action="append",help="A list of the case samples, as multiple arguments or a simple newline-delimited file")
parser.add_option("-O","--control",dest="controls",action="append",help="A list of the control samples, multiple arguments or a newline-delimited file")
parser.add_option("-o","--out",dest="output",action="store",help="Name of the output metadata file to write to")
(options,args) = parser.parse_args()
runMain(options,args)
if __name__ == "__main__":
main()