gatk-3.8/python/ValidateGATK.py

202 lines
8.5 KiB
Python
Executable File

import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
defaultCommands = ['CountReads', 'Pileup']
def indexBAM(reads, queue=None):
cmd = "samtools index " + reads
farm_commands.cmd(cmd, queue, None, just_print_commands=OPTIONS.justPrint)
def readIntervalFile(file):
def notHeaderP(line):
return line[0][0] <> '@'
lines = filter(notHeaderP, map( string.split, [line for line in open(file)]))
byContig = {}
for entry in lines:
elt = [int(entry[1]), int(entry[2])]
if entry[0] in byContig:
byContig[entry[0]].append(elt)
else:
byContig[entry[0]] = [elt]
#print byContig
return byContig
def isIntervalFile(filename):
return os.path.splitext(filename)[1] == '.interval_list'
def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile):
print 'Intervals', intervalMap
def maybeWrite1(line, offsetMap, out):
debug = False
parts = line.split()
contig = parts[0]
pos = int(parts[1])
if debug: print 'pileup', contig, pos, line,
if contig in intervalMap:
intervals = intervalMap[contig]
while (offsetMap[contig] < len(intervals)):
offset = offsetMap[contig]
#print intervals[offset]
curStart, curStop = intervals[offset][0:2]
if debug: print contig, pos, curStart, curStop
if pos >= curStart:
if pos <= curStop:
#print line,
print >> out, line,
break
else:
if debug: print 'Progressing', contig, pos, curStart, curStop
offsetMap[contig] += 1
else:
break
return offsetMap
def allDone( offsetMap, intervalMap ):
def oneDone( contig ):
return offsetMap[contig] >= len(intervalMap[contig])
return all( map(oneDone, offsetMap.iterkeys()) )
i = 0
offsetMap = dict([(key, 0) for key in intervalMap.keys()])
out = open(filteredPileupFile, 'w')
for line in open(unfilteredPileupFile):
offsetMap = maybeWrite1(line, offsetMap, out)
i += 1
if i % 10000 == 0 and allDone(offsetMap, intervalMap):
break
out.close()
def main():
global OPTIONS, ROOT
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--file", dest="validateInfoFile",
type="string", default=None,
help="File name of the validation set")
parser.add_option("-d", "--dataDir", dest="dataDir",
type="string", default=None,
help="Directory to the data files")
parser.add_option("-o", "--outputDir", dest="outputDir",
type="string", default=None,
help="Directory to put the output")
parser.add_option("-q", "--farmQueue", dest="farmQueue",
type="string", default=None,
help="Farm queue to submit jobs to. Leave blank for local processing")
parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
action='store_true', default=False,
help="Ignores the existing files")
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
action='store_true', default=False,
help="If provided, all intermediate files (BAM and pileups) will be regenerated")
parser.add_option("-n", "--dontValidate", dest="dontValidate",
action='store_true', default=False,
help="If provided, won't try to validate, just make sure the validation data files are ready")
parser.add_option("-g", "--gatk", dest="gatkPath",
type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar",
help="Path to the gatk")
parser.add_option("-x", "--args", dest="extraArgs",
type="string", default="",
help="Extra args to pass to the GATK")
parser.add_option("-p", "--justPrint", dest="justPrint",
action='store_true', default=False,
help="Don't actually run GATK, just setup data files")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
if OPTIONS.validateInfoFile == None:
parser.error("f option is required")
if OPTIONS.dataDir == None:
parser.error("d option is required")
if OPTIONS.outputDir == None:
parser.error("o option is required")
if not os.path.exists(OPTIONS.outputDir):
os.mkdir(OPTIONS.outputDir)
if not os.path.exists(OPTIONS.dataDir):
os.mkdir(OPTIONS.dataDir)
for line in open(OPTIONS.validateInfoFile):
if line.strip() == "" or line[0] == '#':
# comment line
continue
# line is of the format:
# <reads.bam> <ref.fasta> <processingRegion>
originalReads, ref, region = line.split()
originalReads = os.path.expanduser(originalReads)
ref = os.path.expanduser(ref)
if not os.path.exists(originalReads) or not os.path.exists(ref):
print 'Input files do not exist!', originalReads, ref
sys.exit(1)
head, read_filename = os.path.split(originalReads)
filebase = os.path.splitext(read_filename)[0]
reads = os.path.join(OPTIONS.dataDir, read_filename)
readsIndex = reads + '.bai'
subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam')
pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup')
validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output')
#print 'reads', reads
if not os.path.exists(reads) or OPTIONS.rebuildAllFiles:
farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads, just_print_commands=OPTIONS.justPrint)
if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles:
indexBAM(reads)
if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
if region == '*' or isIntervalFile(region):
farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM, just_print_commands=OPTIONS.justPrint)
else:
cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
subBAMIndex = subBAM+'.bai'
if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles:
if region == '*' or isIntervalFile(region):
farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex, just_print_commands=OPTIONS.justPrint)
else:
indexBAM(subBAM)
if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
if isIntervalFile(region):
filteredPileup = pileup + ".unfiltered"
if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles:
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup
#farm_commands.cmd(cmd, None, None)
filterByInterval(readIntervalFile(region), filteredPileup, pileup)
else:
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles):
print validationOutput, 'does not exist'
analysis = "ValidatingPileup"
cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs
if isIntervalFile(region):
cmd += " --intervals " + region
cmd += " -B pileup,SAMPileup," + pileup
print cmd
farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)
if __name__ == "__main__":
main()