more and better python scripts for dealing with calls
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@881 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a1218ef508
commit
3998085e4b
|
|
@ -10,7 +10,22 @@ import itertools
|
||||||
|
|
||||||
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
||||||
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||||
analysis = "CombineDuplicates"
|
|
||||||
|
def geli2dbsnpFile(geli):
    """Return the path of the '.dbsnp_matches' file that belongs to a geli file.

    The geli path is broken up by picard_utils.splitPath into three pieces;
    the first two are joined with os.path.join and '.dbsnp_matches' is
    appended. The third piece is not used.
    """
    directory, laneStem, _unusedExt = picard_utils.splitPath(geli)
    matchesStem = os.path.join(directory, laneStem)
    return matchesStem + '.dbsnp_matches'
|
||||||
|
|
||||||
|
|
||||||
|
def bams2geli(bams):
    """Make sure each BAM has a geli file, dispatching a calling job when it does not.

    For every BAM the geli path is the BAM path with its extension replaced
    by '.geli'. When that file does not exist yet, a genotype-calling command
    is built with the hybrid-selection extra arguments and handed to
    farm_commands.cmd on OPTIONS.farmQueue.

    Returns a pair of parallel lists (gelis, jobids). The jobid entry is 0
    for a BAM whose geli already exists; otherwise it is whatever
    farm_commands.cmd returned for that BAM.
    """
    geliPaths = []
    jobIds = []
    for bamPath in bams:
        geliPath = os.path.splitext(bamPath)[0] + '.geli'
        submitted = 0  # 0 means no job was needed for this BAM
        if not os.path.exists(geliPath):
            callCmd = picard_utils.callGenotypesCmd( bamPath, geliPath, options = picard_utils.hybridSelectionExtraArgsForCalling())
            submitted = farm_commands.cmd(callCmd, OPTIONS.farmQueue )
        geliPaths.append(geliPath)
        jobIds.append(submitted)
    return geliPaths, jobIds
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global OPTIONS, ROOT
|
global OPTIONS, ROOT
|
||||||
|
|
@ -25,7 +40,7 @@ def main():
|
||||||
help="minimum lod for calling a variant")
|
help="minimum lod for calling a variant")
|
||||||
parser.add_option("-k", "--column", dest="column",
|
parser.add_option("-k", "--column", dest="column",
|
||||||
type="int", default=1,
|
type="int", default=1,
|
||||||
help="Column in the file with the geli file path")
|
help="Column in the file with the bam or geli file path")
|
||||||
parser.add_option("-o", "--output", dest="output",
|
parser.add_option("-o", "--output", dest="output",
|
||||||
type="string", default='/dev/stdout',
|
type="string", default='/dev/stdout',
|
||||||
help="x")
|
help="x")
|
||||||
|
|
@ -35,17 +50,36 @@ def main():
|
||||||
parser.error("incorrect number of arguments")
|
parser.error("incorrect number of arguments")
|
||||||
lines = [line.split() for line in open(args[0])]
|
lines = [line.split() for line in open(args[0])]
|
||||||
nIndividuals = int(args[1])
|
nIndividuals = int(args[1])
|
||||||
gelis = map( lambda x: x[OPTIONS.column-1], lines )
|
|
||||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
data = map( lambda x: x[OPTIONS.column-1], lines )
|
||||||
|
if os.path.splitext(data[0])[1] == '.bam':
|
||||||
|
gelis, jobids = bams2geli(data)
|
||||||
|
if filter(lambda x: x <> 0, jobids) <> []:
|
||||||
|
# there's still work to do
|
||||||
|
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
||||||
|
print 'gelis', gelis
|
||||||
|
print 'jobids', jobids
|
||||||
|
else:
|
||||||
|
gelis = map( lambda x: x[OPTIONS.column-1], lines )
|
||||||
|
jobids = [None] * len(gelis)
|
||||||
|
|
||||||
|
print 'Geli files'
|
||||||
print gelis
|
print gelis
|
||||||
print variantsOut
|
|
||||||
|
|
||||||
|
for geli, jobid in zip(gelis, jobids):
|
||||||
|
dbsnpFile = geli2dbsnpFile(geli)
|
||||||
|
if not os.path.exists(dbsnpFile):
|
||||||
|
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||||
|
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, waitID = jobid)
|
||||||
|
|
||||||
|
# read in the dbSNP tracks
|
||||||
nTotalSnps = 0
|
nTotalSnps = 0
|
||||||
nNovelSnps = 0
|
nNovelSnps = 0
|
||||||
for geli in gelis:
|
for geli in gelis:
|
||||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||||
dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
#dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||||
|
dbsnp_matches = geli2dbsnpFile(geli)
|
||||||
|
print dbsnp_matches
|
||||||
if os.path.exists(dbsnp_matches):
|
if os.path.exists(dbsnp_matches):
|
||||||
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
||||||
nTotalSnps += int(TOTAL_SNPS)
|
nTotalSnps += int(TOTAL_SNPS)
|
||||||
|
|
@ -55,7 +89,9 @@ def main():
|
||||||
print 'DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
|
print 'DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
|
||||||
print 'DATA: AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps
|
print 'DATA: AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps
|
||||||
|
|
||||||
|
# convert the geli's to text
|
||||||
jobid = None
|
jobid = None
|
||||||
|
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||||
for geli, variantOut in zip(gelis, variantsOut):
|
for geli, variantOut in zip(gelis, variantsOut):
|
||||||
if not os.path.exists(variantOut):
|
if not os.path.exists(variantOut):
|
||||||
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
||||||
|
|
|
||||||
|
|
@ -90,11 +90,11 @@ class MergeFilesSpec:
|
||||||
sizes = map(greek, sizes)
|
sizes = map(greek, sizes)
|
||||||
return sizes
|
return sizes
|
||||||
|
|
||||||
def mergeCmd(self, mergeBin = None, MSD = False):
    """Build the command that merges this spec's sources into its merged BAM.

    mergeBin -- path to the merge binary; when None the module-level
                MERGE_BIN is used instead
    MSD      -- forwarded unchanged as the MSD keyword of
                picard_utils.mergeBAMCmd

    Returns whatever picard_utils.mergeBAMCmd returns for
    (self.getMergedBAM(), self.sources(), mergeBin).
    """
    # Identity test: None is a singleton, so 'is' is the reliable check.
    if mergeBin is None:
        mergeBin = MERGE_BIN

    return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD)
|
||||||
|
|
||||||
def getIndexCmd(self):
    """Return the 'samtools index' command line for this spec's merged BAM."""
    mergedBam = self.getMergedBAM()
    return "samtools index " + mergedBam
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,9 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
|
||||||
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
||||||
type="string", default=None,
|
type="string", default=None,
|
||||||
help="Path to merge binary")
|
help="Path to merge binary")
|
||||||
|
parser.add_option("", "--MSD", dest="MSD",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="Merge sequence dictionaries?")
|
||||||
parser.add_option("", "--keyCol", dest="keyCol",
|
parser.add_option("", "--keyCol", dest="keyCol",
|
||||||
type=int, default=1,
|
type=int, default=1,
|
||||||
help="Column in the list file holding the key")
|
help="Column in the list file holding the key")
|
||||||
|
|
@ -69,7 +72,7 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
|
||||||
if len(spec.sources()) == 1 and OPTIONS.link:
|
if len(spec.sources()) == 1 and OPTIONS.link:
|
||||||
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
|
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
|
||||||
else:
|
else:
|
||||||
cmd = spec.mergeCmd(OPTIONS.mergeBin)
|
cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
|
||||||
print cmd
|
print cmd
|
||||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -46,5 +46,5 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
||||||
status = os.system(cmd_str)
|
status = os.system(cmd_str)
|
||||||
if not farm_queue:
|
if not farm_queue:
|
||||||
print "<<< Exit code:", status,"\n"
|
print "<<< Exit code:", status,"\n"
|
||||||
return status
|
return int(status)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,9 @@ analysis = "CombineDuplicates"
|
||||||
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
||||||
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
|
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
|
||||||
|
|
||||||
|
def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod, dbsnp = '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp'):
    """Build the CollectDbSnpMatches.jar command line for one geli file.

    inputGeli  -- path placed after INPUT=
    outputFile -- path placed after OUTPUT=
    lod        -- number placed after MINIMUM_LOD=, formatted with %f
    dbsnp      -- path placed after DBSNP=; defaults to the assembly18 dbSNP
                  file that used to be hard-coded in the command string

    Returns the command as a string; nothing is executed here.
    """
    return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=%s MINIMUM_LOD=%f' % (inputGeli, outputFile, dbsnp, lod)
|
||||||
|
|
||||||
def unique(l):
    """Return a list that holds each distinct element of l exactly once.

    The elements pass through a set, so they must be hashable and the order
    of the returned list is not defined.
    """
    distinct = set(l)
    return list(distinct)
|
||||||
|
|
||||||
|
|
@ -140,10 +143,14 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
||||||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
|
|
||||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True ):
    """Build the java command line that merges BAM files with the merge jar.

    output_filename -- path placed after O=
    inputFiles      -- iterable of input paths, each emitted as its own I=
                       argument; a single string is treated as one path
    mergeBin        -- path to the merge jar (defaults to MERGE_BIN)
    MSD             -- when true, 'MSD=true' is added to the command

    Returns the command as a string; nothing is executed here.
    """
    if isinstance(inputFiles, str):
        # list() on a bare path would yield one I= argument per character.
        inputFiles = [inputFiles]
    elif not isinstance(inputFiles, list):
        inputFiles = list(inputFiles)

    MSDStr = ''
    if MSD: MSDStr = 'MSD=true'

    return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
|
|
||||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||||
|
|
@ -165,8 +172,11 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile):
|
||||||
return match.group(1)
|
return match.group(1)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def hybridSelectionExtraArgsForCalling(targetIntervals = '/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list'):
    """Return the extra genotype-calling arguments used for hybrid selection.

    targetIntervals -- path placed after TARGET_INTERVALS=; defaults to the
                       thousand_genomes_alpha_redesign interval list that
                       used to be hard-coded in the returned string

    The returned string always ends with CALL_ZERO_COVERAGE_LOCI=true.
    """
    return "TARGET_INTERVALS=%s CALL_ZERO_COVERAGE_LOCI=true" % (targetIntervals,)
|
||||||
|
|
||||||
def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
    """Build the java command line that runs the genotype caller on one BAM.

    inputBam         -- path placed after INPUT=
    outputFilename   -- path placed after OUTPUT=
    callGenotypesBin -- jar passed to 'java -jar' (defaults to CALL_GENOTYPES_BIN)
    options          -- extra text appended verbatim at the end of the command

    REFERENCE_SEQUENCE is filled from the module-level 'ref'. Returns the
    command as a string; nothing is executed here.
    """
    template = "java -jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s"
    fields = ( callGenotypesBin, inputBam, outputFilename, ref, options )
    return template % fields
|
||||||
|
|
||||||
def concord(options, geli, output, genotypeFile, minLod = 5.0):
    """Build the CollectGenotypeConcordanceStatistics command line.

    options      -- path placed after OPTIONS_FILE=
    geli         -- path placed after INPUT=
    output       -- path placed after OUTPUT=
    genotypeFile -- path placed after REFERENCE_GENOTYPES=
    minLod       -- value placed after MINIMUM_LOD=; the default of 5.0
                    reproduces the previously hard-coded 'MINIMUM_LOD=5.0'

    Returns the command as a string; nothing is executed here.
    """
    return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=%s" % ( options, geli, output, genotypeFile, minLod ) )
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue