Fixed potential bug with next() operation returning empty contexts when a read contains a large deletion. We can now use the look ahead safely...

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@439 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-15 21:41:30 +00:00
parent 7261787b71
commit 2eabcfedb7
3 changed files with 132 additions and 2 deletions

View File

@ -0,0 +1,103 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
defaultCommands = ['CountReads', 'Pileup']
def indexBAM(reads):
cmd = "samtools index " + reads
farm_commands.cmd(cmd, None, None)
def main():
global OPTIONS, ROOT
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--file", dest="validateInfoFile",
type="string", default=None,
help="File name of the validation set")
parser.add_option("-d", "--dataDir", dest="dataDir",
type="string", default=None,
help="Directory to the data files")
parser.add_option("-o", "--outputDir", dest="outputDir",
type="string", default=None,
help="Directory to put the output")
parser.add_option("-q", "--farmQueue", dest="farmQueue",
type="string", default=None,
help="Farm queue to submit jobs to. Leave blank for local processing")
parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
action='store_true', default=True,
help="Ignores the existing files")
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
action='store_true', default=False,
help="If provided, all intermediate files (BAM and pileups) will be regenerated")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
if OPTIONS.validateInfoFile == None:
parser.error("f option is required")
if OPTIONS.dataDir == None:
parser.error("d option is required")
if OPTIONS.outputDir == None:
parser.error("o option is required")
if not os.path.exists(OPTIONS.outputDir):
os.mkdir(OPTIONS.outputDir)
if not os.path.exists(OPTIONS.dataDir):
os.mkdir(OPTIONS.dataDir)
for line in open(OPTIONS.validateInfoFile):
if line.strip() == "" or line[0] == '#':
# comment line
continue
# line is of the format:
# <reads.bam> <ref.fasta> <processingRegion>
originalReads, ref, region = line.split()
originalReads = os.path.expanduser(originalReads)
ref = os.path.expanduser(ref)
if not os.path.exists(originalReads) or not os.path.exists(ref):
print 'Input files do not exist!', originalReads, ref
sys.exit(1)
head, read_filename = os.path.split(originalReads)
filebase = os.path.splitext(read_filename)[0]
reads = os.path.join(OPTIONS.dataDir, read_filename)
readsIndex = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam.bai')
subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam')
pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup')
validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output')
#print 'reads', reads
if not os.path.exists(reads) or OPTIONS.rebuildAllFiles:
farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads)
if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles:
indexBAM(reads)
if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
if region == '*':
farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM)
farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAM+'.bai')
else:
cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
farm_commands.cmd(cmd, None, None)
if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
farm_commands.cmd(cmd, None, None)
if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles:
analysis = "ValidatingPileup"
cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup
print cmd
farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput)
if __name__ == "__main__":
main()

View File

@ -3,11 +3,14 @@
import os
#justPrintCommands = False
def cmd(cmd_str, farm_queue=False, output_head="", just_print_commands=False):
def cmd(cmd_str, farm_queue=False, output_head="", just_print_commands=False, outputFile = None):
# if farm_queue is non-False, submits to queue, other
if farm_queue:
farm_stdout = output_head+".stdout"
if outputFile <> None:
farm_stdout = outputFile
else:
farm_stdout = output_head+".stdout"
cmd_str = "bsub -q "+farm_queue+" -o "+farm_stdout+" "+cmd_str #+" TMP_DIR=/wga/scr1/andrewk/tmp"
print ">>> Farming via "+cmd_str
else:

View File

@ -0,0 +1,24 @@
# e.coli
/humgen/gsa-scr1/GATK_Data/Validation_Data/MV1994.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Escherichia_coli_K12_MG1655.fasta *
# daughter chrom 6 deep coverage
/broad/1KG/DCC/data/NA12878/alignment/NA12878.chrom6.SRP000032.2009_02.bam /broad/1KG/reference/human_b36_both.fasta 6:1-10,000,00
# daughter chrom 1 deep coverage
/broad/1KG/DCC/data/NA12878/alignment/NA12878.chrom1.SRP000032.2009_02.bam /broad/1KG/reference/human_b36_female.fasta *
# mother of trio individual low coverage
/broad/1KG/DCC/data/NA12892/alignment/NA12892.SRP000031.2009_02.bam /broad/1KG/reference/human_b36_female.fasta *
# Pilot 3
/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# anthony PCR lane
/seq/dirseq/aphilipp/combo/sequences/pcr/samfiles/10035.5.clean.sam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# anthony PCR lane
/seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
# WGS tumor
#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *