diff --git a/python/ValidateGATK.py b/python/ValidateGATK.py index 984adef44..952cb74ea 100755 --- a/python/ValidateGATK.py +++ b/python/ValidateGATK.py @@ -2,6 +2,7 @@ import farm_commands import os.path import sys from optparse import OptionParser +import string defaultCommands = ['CountReads', 'Pileup'] @@ -9,6 +10,72 @@ def indexBAM(reads): cmd = "samtools index " + reads farm_commands.cmd(cmd, None, None) +def readIntervalFile(file): + def notHeaderP(line): + return line[0][0] <> '@' + lines = filter(notHeaderP, map( string.split, [line for line in open(file)])) + + byContig = {} + for entry in lines: + elt = [int(entry[1]), int(entry[2])] + if entry[0] in byContig: + byContig[entry[0]].append(elt) + else: + byContig[entry[0]] = [elt] + + #print byContig + return byContig + +def isIntervalFile(filename): + return os.path.splitext(filename)[1] == '.interval_list' + +def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile): + print 'Intervals', intervalMap + + def maybeWrite1(line, offsetMap, out): + debug = False + parts = line.split() + contig = parts[0] + pos = int(parts[1]) + if debug: print 'pileup', contig, pos, line, + + if contig in intervalMap: + intervals = intervalMap[contig] + + while (offsetMap[contig] < len(intervals)): + offset = offsetMap[contig] + #print intervals[offset] + curStart, curStop = intervals[offset][0:2] + if debug: print contig, pos, curStart, curStop + if pos >= curStart: + if pos <= curStop: + #print line, + print >> out, line, + break + else: + if debug: print 'Progressing', contig, pos, curStart, curStop + offsetMap[contig] += 1 + else: + break + return offsetMap + + def allDone( offsetMap, intervalMap ): + def oneDone( contig ): + return offsetMap[contig] >= len(intervalMap[contig]) + return all( map(oneDone, offsetMap.iterkeys()) ) + + i = 0 + offsetMap = dict([(key, 0) for key in intervalMap.keys()]) + out = open(filteredPileupFile, 'w') + for line in open(unfilteredPileupFile): + offsetMap = maybeWrite1(line, offsetMap, out) + i += 1 + if i % 10000 == 0 and allDone(offsetMap, intervalMap): + break + out.close() + + + def main(): global OPTIONS, ROOT @@ -32,6 +99,15 @@ def main(): parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles", action='store_true', default=False, help="If provided, all intermediate files (BAM and pileups) will be regenerated") + parser.add_option("-n", "--dontValidate", dest="dontValidate", + action='store_true', default=False, + help="If provided, won't try to validate, just make sure the validation data files are ready") + parser.add_option("-g", "--gatk", dest="gatkPath", + type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar", + help="Path to the gatk") + parser.add_option("-x", "--args", dest="extraArgs", + type="string", default="", + help="Extra args to pass to the GATK") parser.add_option("-p", "--justPrint", dest="justPrint", action='store_true', default=False, help="Don't actually run GATK, just setup data files") @@ -65,7 +141,7 @@ def main(): originalReads = os.path.expanduser(originalReads) ref = os.path.expanduser(ref) - + if not os.path.exists(originalReads) or not os.path.exists(ref): print 'Input files do not exist!', originalReads, ref sys.exit(1) @@ -87,22 +163,39 @@ def main(): indexBAM(reads) if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles: - if region == '*': + if region == '*' or isIntervalFile(region): farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM) - farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAM+'.bai') else: cmd = "samtools view -b " + reads + " " + region + " > " + subBAM farm_commands.cmd(cmd, None, None) + + subBAMIndex = subBAM+'.bai' + if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles: + if region == '*' or isIntervalFile(region): + farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex) + else: indexBAM(subBAM) - + if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles: - cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup - farm_commands.cmd(cmd, None, None) + if isIntervalFile(region): + filteredPileup = pileup + ".unfiltered" + if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles: + cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup + #farm_commands.cmd(cmd, None, None) + filterByInterval(readIntervalFile(region), filteredPileup, pileup) + else: + cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup + farm_commands.cmd(cmd, None, None) - if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles: + if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles): print validationOutput, 'does not exist' analysis = "ValidatingPileup" - cmd = "java -ea -Xmx1024m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup + cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs + + if isIntervalFile(region): + cmd += " --intervals_file " + region + + cmd += " -B pileup SAMPileup " + pileup print cmd farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint) diff --git a/testdata/ValidatingPileupTargets.list b/testdata/ValidatingPileupTargets.list index 499bd4e6e..43f5722f8 100755 --- a/testdata/ValidatingPileupTargets.list +++ b/testdata/ValidatingPileupTargets.list @@ -21,6 +21,9 @@ # samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta * +# whole exom interval-based analysis for cancer +/humgen/gsa-scr1/GATK_Data/Validation_Data/TCGA-06-0188.aligned.duplicates_marked.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta /humgen/gsa-scr1/GATK_Data/Validation_Data/TCGA-06-0188.interval_list + # WGS tumor -- figure it out #/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *