From 2bdb0118650b398a4654e4bf7ea8e33ced65e4c4 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 16 Jul 2010 15:33:47 +0000 Subject: [PATCH] Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a --- python/1kgStatsForCalls.py | 18 ++++++++++++------ python/madPipelineUtils.py | 2 ++ python/realignBamByChr.py | 17 +++++++++++------ 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py index 38c368a48..3a4e90a7f 100755 --- a/python/1kgStatsForCalls.py +++ b/python/1kgStatsForCalls.py @@ -11,7 +11,7 @@ import string def average(l): sum = reduce(operator.add, l, 0) - return sum / (1.0 * len(l)) + return sum / (1.0 * (max(len(l), 1))) def printHeaderSep(): print @@ -87,8 +87,9 @@ def countBases(samples, seqIndex): return total def printStatus(samples): - for sample in samples.itervalues(): - print sample + if OPTIONS.verbose: + for sample in samples.itervalues(): + print sample def findVariantEvalResults(key, file, type=str): def capture1(line): @@ -159,11 +160,14 @@ def countSNPs(samples, snpsVCF, useIndels = False): sampleIDs = columnNames[9:] print 'Counting SNPs...' + #lines = 0 for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False): + #lines += 1 + #print 'lines', lines if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ): break - if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']: + if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']: if ( vcf.isVariant() ): total += 1 @@ -184,6 +188,8 @@ def countSNPs(samples, snpsVCF, useIndels = False): samples[sampleID].nIndels += 1 else: samples[sampleID].nSNPs += 1 + else: + print 'rejecting line', vcf printStatus(samples) return total, novel @@ -238,12 +244,12 @@ if __name__ == "__main__": ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex) totalBases = countBases(samples, OPTIONS.sequenceIndex) - meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples + meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1)) totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps) totalIndels, novelIndels = countIndels(samples, OPTIONS.indels) snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE) - indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE) + indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE) out = sys.stdout if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w') diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py index 23c333b3d..3db856bd2 100755 --- a/python/madPipelineUtils.py +++ b/python/madPipelineUtils.py @@ -73,6 +73,7 @@ class PipelineArgs: return cmd def chrsToSplitBy(self, chrs): + #print 'XXX', self.excludeChrs return filter(lambda x: x not in self.excludeChrs, chrs) # @@ -101,6 +102,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies) return chrCmd, chrOutputMap + #print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18) splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) ) splitCommands = map(lambda x: x[0], splits) diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py index 31cdfa67e..42ae65f48 100755 --- a/python/realignBamByChr.py +++ b/python/realignBamByChr.py @@ -68,7 +68,7 @@ def main(): def includeStage(name): return name in stages - out = open(outputBamList, 'w') + #out = open(outputBamList, 'w') realignInfo = [] for chr in getContigs(OPTIONS.contigs, hg18): lastJobs = None @@ -92,16 +92,19 @@ def main(): realignInfo.append([realignJobs, realignedBam]) # need to merge and then index indexJobs, ignore = execStage('index', index, realignedBam, realignJobs) - print >> out, os.path.abspath(realignedBam) + #print >> out, os.path.abspath(realignedBam) - out.close() + #out.close() if 'merge' in stages: realignerJobs = [] if realignInfo[0][0] != []: realignerJobs = map(lambda x: x[0][0], realignInfo) - mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs) + mergedBam = outputRoot + ".bam" + mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs) + indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob) allJobs.append(mergerJob) + allJobs.append(indexJob) print 'EXECUTING JOBS' executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) @@ -113,7 +116,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ): def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ): outputBAM = outputRoot + ".bam" - GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr) + GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr) return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): @@ -123,7 +126,9 @@ def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ): print lastJobs #cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 ) cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5) - return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs) + jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs) + #jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 ) + return jobs1 if __name__ == "__main__": main()