From 2eabcfedb76b9bc7be09d0d75b143a5ea679b1a5 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 15 Apr 2009 21:41:30 +0000 Subject: [PATCH] Fixed potential bug with next() operation returning empty contexts when a read contains a large deletion. We can now use the look ahead safely... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@439 348d0f76-0448-11de-a6fe-93d51630548a --- python/ValidateGATK.py | 103 ++++++++++++++++++++++++++ python/farm_commands.py | 7 +- testdata/ValidatingPileupTargets.list | 24 ++++++ 3 files changed, 132 insertions(+), 2 deletions(-) create mode 100755 python/ValidateGATK.py create mode 100755 testdata/ValidatingPileupTargets.list diff --git a/python/ValidateGATK.py b/python/ValidateGATK.py new file mode 100755 index 000000000..9c883fea6 --- /dev/null +++ b/python/ValidateGATK.py @@ -0,0 +1,103 @@ +import farm_commands +import os.path +import sys +from optparse import OptionParser + +defaultCommands = ['CountReads', 'Pileup'] + +def indexBAM(reads): + cmd = "samtools index " + reads + farm_commands.cmd(cmd, None, None) + +def main(): + global OPTIONS, ROOT + + usage = "usage: %prog [options]" + parser = OptionParser(usage=usage) + parser.add_option("-f", "--file", dest="validateInfoFile", + type="string", default=None, + help="File name of the validation set") + parser.add_option("-d", "--dataDir", dest="dataDir", + type="string", default=None, + help="Directory to the data files") + parser.add_option("-o", "--outputDir", dest="outputDir", + type="string", default=None, + help="Directory to put the output") + parser.add_option("-q", "--farmQueue", dest="farmQueue", + type="string", default=None, + help="Farm queue to submit jobs to. Leave blank for local processing") + parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles", + action='store_true', default=True, + help="Ignores the existing files") + parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles", + action='store_true', default=False, + help="If provided, all intermediate files (BAM and pileups) will be regenerated") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 0: + parser.error("incorrect number of arguments") + + if OPTIONS.validateInfoFile == None: + parser.error("f option is required") + if OPTIONS.dataDir == None: + parser.error("d option is required") + if OPTIONS.outputDir == None: + parser.error("o option is required") + + if not os.path.exists(OPTIONS.outputDir): + os.mkdir(OPTIONS.outputDir) + if not os.path.exists(OPTIONS.dataDir): + os.mkdir(OPTIONS.dataDir) + + for line in open(OPTIONS.validateInfoFile): + if line.strip() == "" or line[0] == '#': + # comment line + continue + + # line is of the format: + # + originalReads, ref, region = line.split() + + originalReads = os.path.expanduser(originalReads) + ref = os.path.expanduser(ref) + + if not os.path.exists(originalReads) or not os.path.exists(ref): + print 'Input files do not exist!', originalReads, ref + sys.exit(1) + + head, read_filename = os.path.split(originalReads) + filebase = os.path.splitext(read_filename)[0] + + reads = os.path.join(OPTIONS.dataDir, read_filename) + readsIndex = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam.bai') + subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam') + pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup') + validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output') + + #print 'reads', reads + if not os.path.exists(reads) or OPTIONS.rebuildAllFiles: + farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads) + + if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles: + indexBAM(reads) + + if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles: + if region == '*': + farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM) + farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAM+'.bai') + else: + cmd = "samtools view -b " + reads + " " + region + " > " + subBAM + farm_commands.cmd(cmd, None, None) + + if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles: + cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup + farm_commands.cmd(cmd, None, None) + + if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles: + analysis = "ValidatingPileup" + cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup + print cmd + farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput) + +if __name__ == "__main__": + main() diff --git a/python/farm_commands.py b/python/farm_commands.py index 8e9ad0cab..f6b618c98 100644 --- a/python/farm_commands.py +++ b/python/farm_commands.py @@ -3,11 +3,14 @@ import os #justPrintCommands = False -def cmd(cmd_str, farm_queue=False, output_head="", just_print_commands=False): +def cmd(cmd_str, farm_queue=False, output_head="", just_print_commands=False, outputFile = None): # if farm_queue is non-False, submits to queue, other if farm_queue: - farm_stdout = output_head+".stdout" + if outputFile <> None: + farm_stdout = outputFile + else: + farm_stdout = output_head+".stdout" cmd_str = "bsub -q "+farm_queue+" -o "+farm_stdout+" "+cmd_str #+" TMP_DIR=/wga/scr1/andrewk/tmp" print ">>> Farming via "+cmd_str else: diff --git a/testdata/ValidatingPileupTargets.list b/testdata/ValidatingPileupTargets.list new file mode 100755 index 000000000..08772b636 --- /dev/null +++ b/testdata/ValidatingPileupTargets.list @@ -0,0 +1,24 @@ +# e.coli +/humgen/gsa-scr1/GATK_Data/Validation_Data/MV1994.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Escherichia_coli_K12_MG1655.fasta * + +# daughter chrom 6 deep coverage +/broad/1KG/DCC/data/NA12878/alignment/NA12878.chrom6.SRP000032.2009_02.bam /broad/1KG/reference/human_b36_both.fasta 6:1-10,000,00 + +# daughter chrom 1 deep coverage +/broad/1KG/DCC/data/NA12878/alignment/NA12878.chrom1.SRP000032.2009_02.bam /broad/1KG/reference/human_b36_female.fasta * + +# mother of trio individual low coverage +/broad/1KG/DCC/data/NA12892/alignment/NA12892.SRP000031.2009_02.bam /broad/1KG/reference/human_b36_female.fasta * + +# Pilot 3 +/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * + +# anthony PCR lane +/seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * + +# anthony PCR lane +/seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * + +# WGS tumor +#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta * +