Improvements to 1KG processing pipeline

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-16 15:33:47 +00:00
parent 1dd8a28a5d
commit 2bdb011865
3 changed files with 25 additions and 12 deletions

View File

@ -11,7 +11,7 @@ import string
def average(l):
sum = reduce(operator.add, l, 0)
return sum / (1.0 * len(l))
return sum / (1.0 * (max(len(l), 1)))
def printHeaderSep():
print
@ -87,8 +87,9 @@ def countBases(samples, seqIndex):
return total
def printStatus(samples):
for sample in samples.itervalues():
print sample
if OPTIONS.verbose:
for sample in samples.itervalues():
print sample
def findVariantEvalResults(key, file, type=str):
def capture1(line):
@ -159,11 +160,14 @@ def countSNPs(samples, snpsVCF, useIndels = False):
sampleIDs = columnNames[9:]
print 'Counting SNPs...'
#lines = 0
for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
#lines += 1
#print 'lines', lines
if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
break
if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:
if ( vcf.isVariant() ):
total += 1
@ -184,6 +188,8 @@ def countSNPs(samples, snpsVCF, useIndels = False):
samples[sampleID].nIndels += 1
else:
samples[sampleID].nSNPs += 1
else:
print 'rejecting line', vcf
printStatus(samples)
return total, novel
@ -238,12 +244,12 @@ if __name__ == "__main__":
ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)
totalBases = countBases(samples, OPTIONS.sequenceIndex)
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))
totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)
out = sys.stdout
if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')

View File

@ -73,6 +73,7 @@ class PipelineArgs:
return cmd
def chrsToSplitBy(self, chrs):
#print 'XXX', self.excludeChrs
return filter(lambda x: x not in self.excludeChrs, chrs)
#
@ -101,6 +102,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
return chrCmd, chrOutputMap
#print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18)
splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
splitCommands = map(lambda x: x[0], splits)

View File

@ -68,7 +68,7 @@ def main():
def includeStage(name):
return name in stages
out = open(outputBamList, 'w')
#out = open(outputBamList, 'w')
realignInfo = []
for chr in getContigs(OPTIONS.contigs, hg18):
lastJobs = None
@ -92,16 +92,19 @@ def main():
realignInfo.append([realignJobs, realignedBam])
# need to merge and then index
indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
print >> out, os.path.abspath(realignedBam)
#print >> out, os.path.abspath(realignedBam)
out.close()
#out.close()
if 'merge' in stages:
realignerJobs = []
if realignInfo[0][0] != []:
realignerJobs = map(lambda x: x[0][0], realignInfo)
mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
mergedBam = outputRoot + ".bam"
mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs)
indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob)
allJobs.append(mergerJob)
allJobs.append(indexJob)
print 'EXECUTING JOBS'
executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
@ -113,7 +116,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
outputBAM = outputRoot + ".bam"
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr)
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
@ -123,7 +126,9 @@ def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
print lastJobs
#cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5)
return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
#jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 )
return jobs1
if __name__ == "__main__":
main()