diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index 60f25456f..b3792527f 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -10,7 +10,22 @@ import itertools gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" -analysis = "CombineDuplicates" + +def geli2dbsnpFile(geli): + root, flowcellDotlane, ext = picard_utils.splitPath(geli) + return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' + + +def bams2geli(bams): + def call1(bam): + geli = os.path.splitext(bam)[0] + '.geli' + jobid = 0 + if not os.path.exists(geli): + cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue ) + return geli, jobid + calls = map(call1, bams) + return map(lambda x: x[0], calls), map(lambda x: x[1], calls) def main(): global OPTIONS, ROOT @@ -25,7 +40,7 @@ def main(): help="minimum lod for calling a variant") parser.add_option("-k", "--column", dest="column", type="int", default=1, - help="Column in the file with the geli file path") + help="Column in the file with the bam or geli file path") parser.add_option("-o", "--output", dest="output", type="string", default='/dev/stdout', help="x") @@ -35,17 +50,36 @@ def main(): parser.error("incorrect number of arguments") lines = [line.split() for line in open(args[0])] nIndividuals = int(args[1]) - gelis = map( lambda x: x[OPTIONS.column-1], lines ) - variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis) - + + data = map( lambda x: x[OPTIONS.column-1], lines ) + if os.path.splitext(data[0])[1] == '.bam': + gelis, jobids = bams2geli(data) + if filter(lambda x: x <> 0, jobids) <> []: + # there's still work to do + sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids)) + print 'gelis', gelis + print 'jobids', jobids + else: + gelis = map( lambda x: x[OPTIONS.column-1], lines ) + jobids = [None] * len(gelis) + + print 'Geli files' print gelis - print variantsOut + for geli, jobid in zip(gelis, jobids): + dbsnpFile = geli2dbsnpFile(geli) + if not os.path.exists(dbsnpFile): + dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod) + farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, waitID = jobid) + + # read in the dbSNP tracks nTotalSnps = 0 nNovelSnps = 0 for geli in gelis: root, flowcellDotlane, ext = picard_utils.splitPath(geli) - dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches' + #dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches' + dbsnp_matches = geli2dbsnpFile(geli) + print dbsnp_matches if os.path.exists(dbsnp_matches): TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches) nTotalSnps += int(TOTAL_SNPS) @@ -55,7 +89,9 @@ def main(): print 'DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps print 'DATA: AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps + # convert the geli's to text jobid = None + variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis) for geli, variantOut in zip(gelis, variantsOut): if not os.path.exists(variantOut): cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) ) diff --git a/python/MergeBAMsUtils.py b/python/MergeBAMsUtils.py index b29de288f..f520521b0 100755 --- a/python/MergeBAMsUtils.py +++ b/python/MergeBAMsUtils.py @@ -90,11 +90,11 @@ class MergeFilesSpec: sizes = map(greek, sizes) return sizes - def mergeCmd(self, mergeBin = None): + def mergeCmd(self, mergeBin = None, MSD = False): if mergeBin == None: mergeBin = MERGE_BIN - return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin) + return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD) def getIndexCmd(self): return "samtools index " + self.getMergedBAM() diff --git a/python/MergeBamsByKey.py b/python/MergeBamsByKey.py index 43d3909bd..fbac2cc7f 100755 --- a/python/MergeBamsByKey.py +++ b/python/MergeBamsByKey.py @@ -34,6 +34,9 @@ by key and spawn merge and index jobs to merge all of the files sharing the same parser.add_option("-m", "--mergeBin", dest="mergeBin", type="string", default=None, help="Path to merge binary") + parser.add_option("", "--MSD", dest="MSD", + action='store_true', default=False, + help="Merge sequence dictionaries?") parser.add_option("", "--keyCol", dest="keyCol", type=int, default=1, help="Column in the list file holding the key") @@ -69,7 +72,7 @@ by key and spawn merge and index jobs to merge all of the files sharing the same if len(spec.sources()) == 1 and OPTIONS.link: cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM() else: - cmd = spec.mergeCmd(OPTIONS.mergeBin) + cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD) print cmd jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry) diff --git a/python/farm_commands.py b/python/farm_commands.py index b72f01c6a..37167b3ef 100644 --- a/python/farm_commands.py +++ b/python/farm_commands.py @@ -46,5 +46,5 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman status = os.system(cmd_str) if not farm_queue: print "<<< Exit code:", status,"\n" - return status + return int(status) diff --git a/python/picard_utils.py b/python/picard_utils.py index 0a3fdf5d9..2e5070612 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -21,6 +21,9 @@ analysis = "CombineDuplicates" MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar' CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar' +def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod): + return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp MINIMUM_LOD=%f' % (inputGeli, outputFile, lod) + def unique(l): return list(set(l)) @@ -140,10 +143,14 @@ def aggregateGeliCalls( sortedGeliCalls ): #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] -def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ): +def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True ): if type(inputFiles) <> list: inputFiles = list(inputFiles) - return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) + + MSDStr = '' + if MSD: MSDStr = 'MSD=true' + + return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) def getPicardPath(lane, picardRoot = '/seq/picard/'): @@ -165,8 +172,11 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile): return match.group(1) return None +def hybridSelectionExtraArgsForCalling(): + return "TARGET_INTERVALS=/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list CALL_ZERO_COVERAGE_LOCI=true" + def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''): - return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options) + return "java -jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, ref, options) def concord(options, geli, output, genotypeFile): return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )