Better validation scripts and data
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@562 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7de5da7065
commit
30218ee31a
|
|
@ -2,6 +2,7 @@ import farm_commands
|
||||||
import os.path
|
import os.path
|
||||||
import sys
|
import sys
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
import string
|
||||||
|
|
||||||
defaultCommands = ['CountReads', 'Pileup']
|
defaultCommands = ['CountReads', 'Pileup']
|
||||||
|
|
||||||
|
|
@ -9,6 +10,72 @@ def indexBAM(reads):
|
||||||
cmd = "samtools index " + reads
|
cmd = "samtools index " + reads
|
||||||
farm_commands.cmd(cmd, None, None)
|
farm_commands.cmd(cmd, None, None)
|
||||||
|
|
||||||
|
def readIntervalFile(file):
|
||||||
|
def notHeaderP(line):
|
||||||
|
return line[0][0] <> '@'
|
||||||
|
lines = filter(notHeaderP, map( string.split, [line for line in open(file)]))
|
||||||
|
|
||||||
|
byContig = {}
|
||||||
|
for entry in lines:
|
||||||
|
elt = [int(entry[1]), int(entry[2])]
|
||||||
|
if entry[0] in byContig:
|
||||||
|
byContig[entry[0]].append(elt)
|
||||||
|
else:
|
||||||
|
byContig[entry[0]] = [elt]
|
||||||
|
|
||||||
|
#print byContig
|
||||||
|
return byContig
|
||||||
|
|
||||||
|
def isIntervalFile(filename):
|
||||||
|
return os.path.splitext(filename)[1] == '.interval_list'
|
||||||
|
|
||||||
|
def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile):
|
||||||
|
print 'Intervals', intervalMap
|
||||||
|
|
||||||
|
def maybeWrite1(line, offsetMap, out):
|
||||||
|
debug = False
|
||||||
|
parts = line.split()
|
||||||
|
contig = parts[0]
|
||||||
|
pos = int(parts[1])
|
||||||
|
if debug: print 'pileup', contig, pos, line,
|
||||||
|
|
||||||
|
if contig in intervalMap:
|
||||||
|
intervals = intervalMap[contig]
|
||||||
|
|
||||||
|
while (offsetMap[contig] < len(intervals)):
|
||||||
|
offset = offsetMap[contig]
|
||||||
|
#print intervals[offset]
|
||||||
|
curStart, curStop = intervals[offset][0:2]
|
||||||
|
if debug: print contig, pos, curStart, curStop
|
||||||
|
if pos >= curStart:
|
||||||
|
if pos <= curStop:
|
||||||
|
#print line,
|
||||||
|
print >> out, line,
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if debug: print 'Progressing', contig, pos, curStart, curStop
|
||||||
|
offsetMap[contig] += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
return offsetMap
|
||||||
|
|
||||||
|
def allDone( offsetMap, intervalMap ):
|
||||||
|
def oneDone( contig ):
|
||||||
|
return offsetMap[contig] >= len(intervalMap[contig])
|
||||||
|
return all( map(oneDone, offsetMap.iterkeys()) )
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
offsetMap = dict([(key, 0) for key in intervalMap.keys()])
|
||||||
|
out = open(filteredPileupFile, 'w')
|
||||||
|
for line in open(unfilteredPileupFile):
|
||||||
|
offsetMap = maybeWrite1(line, offsetMap, out)
|
||||||
|
i += 1
|
||||||
|
if i % 10000 == 0 and allDone(offsetMap, intervalMap):
|
||||||
|
break
|
||||||
|
out.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global OPTIONS, ROOT
|
global OPTIONS, ROOT
|
||||||
|
|
||||||
|
|
@ -32,6 +99,15 @@ def main():
|
||||||
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
|
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, all intermediate files (BAM and pileups) will be regenerated")
|
help="If provided, all intermediate files (BAM and pileups) will be regenerated")
|
||||||
|
parser.add_option("-n", "--dontValidate", dest="dontValidate",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, won't try to validate, just make sure the validation data files are ready")
|
||||||
|
parser.add_option("-g", "--gatk", dest="gatkPath",
|
||||||
|
type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar",
|
||||||
|
help="Path to the gatk")
|
||||||
|
parser.add_option("-x", "--args", dest="extraArgs",
|
||||||
|
type="string", default="",
|
||||||
|
help="Extra args to pass to the GATK")
|
||||||
parser.add_option("-p", "--justPrint", dest="justPrint",
|
parser.add_option("-p", "--justPrint", dest="justPrint",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="Don't actually run GATK, just setup data files")
|
help="Don't actually run GATK, just setup data files")
|
||||||
|
|
@ -65,7 +141,7 @@ def main():
|
||||||
|
|
||||||
originalReads = os.path.expanduser(originalReads)
|
originalReads = os.path.expanduser(originalReads)
|
||||||
ref = os.path.expanduser(ref)
|
ref = os.path.expanduser(ref)
|
||||||
|
|
||||||
if not os.path.exists(originalReads) or not os.path.exists(ref):
|
if not os.path.exists(originalReads) or not os.path.exists(ref):
|
||||||
print 'Input files do not exist!', originalReads, ref
|
print 'Input files do not exist!', originalReads, ref
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
@ -87,22 +163,39 @@ def main():
|
||||||
indexBAM(reads)
|
indexBAM(reads)
|
||||||
|
|
||||||
if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
|
if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
|
||||||
if region == '*':
|
if region == '*' or isIntervalFile(region):
|
||||||
farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM)
|
farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM)
|
||||||
farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAM+'.bai')
|
|
||||||
else:
|
else:
|
||||||
cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
|
cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
|
||||||
farm_commands.cmd(cmd, None, None)
|
farm_commands.cmd(cmd, None, None)
|
||||||
|
|
||||||
|
subBAMIndex = subBAM+'.bai'
|
||||||
|
if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles:
|
||||||
|
if region == '*' or isIntervalFile(region):
|
||||||
|
farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex)
|
||||||
|
else:
|
||||||
indexBAM(subBAM)
|
indexBAM(subBAM)
|
||||||
|
|
||||||
if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
|
if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
|
||||||
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
|
if isIntervalFile(region):
|
||||||
farm_commands.cmd(cmd, None, None)
|
filteredPileup = pileup + ".unfiltered"
|
||||||
|
if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles:
|
||||||
|
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup
|
||||||
|
#farm_commands.cmd(cmd, None, None)
|
||||||
|
filterByInterval(readIntervalFile(region), filteredPileup, pileup)
|
||||||
|
else:
|
||||||
|
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
|
||||||
|
farm_commands.cmd(cmd, None, None)
|
||||||
|
|
||||||
if not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles:
|
if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles):
|
||||||
print validationOutput, 'does not exist'
|
print validationOutput, 'does not exist'
|
||||||
analysis = "ValidatingPileup"
|
analysis = "ValidatingPileup"
|
||||||
cmd = "java -ea -Xmx1024m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U -B pileup SAMPileup " + pileup
|
cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs
|
||||||
|
|
||||||
|
if isIntervalFile(region):
|
||||||
|
cmd += " --intervals_file " + region
|
||||||
|
|
||||||
|
cmd += " -B pileup SAMPileup " + pileup
|
||||||
print cmd
|
print cmd
|
||||||
farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)
|
farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,9 @@
|
||||||
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
|
# samtools import /seq/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.bam.ref_list /seq/dirseq/aphilipp/combo/sequences/hs/samfiles/30CLA.5.clean.sam 30CLA.5.clean.bam
|
||||||
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
|
/humgen/gsa-scr1/GATK_Data/Validation_Data/30CLA.5.clean.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta *
|
||||||
|
|
||||||
|
# whole exom interval-based analysis for cancer
|
||||||
|
/humgen/gsa-scr1/GATK_Data/Validation_Data/TCGA-06-0188.aligned.duplicates_marked.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta /humgen/gsa-scr1/GATK_Data/Validation_Data/TCGA-06-0188.interval_list
|
||||||
|
|
||||||
# WGS tumor -- figure it out
|
# WGS tumor -- figure it out
|
||||||
#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
#/broad/1KG/pilot3/sams/NA12892.bam /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta *
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue