Improvements to 1KG processing pipeline
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1dd8a28a5d
commit
2bdb011865
|
|
@ -11,7 +11,7 @@ import string
|
|||
|
||||
def average(l):
|
||||
sum = reduce(operator.add, l, 0)
|
||||
return sum / (1.0 * len(l))
|
||||
return sum / (1.0 * (max(len(l), 1)))
|
||||
|
||||
def printHeaderSep():
|
||||
print
|
||||
|
|
@ -87,8 +87,9 @@ def countBases(samples, seqIndex):
|
|||
return total
|
||||
|
||||
def printStatus(samples):
|
||||
for sample in samples.itervalues():
|
||||
print sample
|
||||
if OPTIONS.verbose:
|
||||
for sample in samples.itervalues():
|
||||
print sample
|
||||
|
||||
def findVariantEvalResults(key, file, type=str):
|
||||
def capture1(line):
|
||||
|
|
@ -159,11 +160,14 @@ def countSNPs(samples, snpsVCF, useIndels = False):
|
|||
sampleIDs = columnNames[9:]
|
||||
|
||||
print 'Counting SNPs...'
|
||||
#lines = 0
|
||||
for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
|
||||
#lines += 1
|
||||
#print 'lines', lines
|
||||
if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
|
||||
break
|
||||
|
||||
if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
|
||||
if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:
|
||||
if ( vcf.isVariant() ):
|
||||
total += 1
|
||||
|
||||
|
|
@ -184,6 +188,8 @@ def countSNPs(samples, snpsVCF, useIndels = False):
|
|||
samples[sampleID].nIndels += 1
|
||||
else:
|
||||
samples[sampleID].nSNPs += 1
|
||||
else:
|
||||
print 'rejecting line', vcf
|
||||
|
||||
printStatus(samples)
|
||||
return total, novel
|
||||
|
|
@ -238,12 +244,12 @@ if __name__ == "__main__":
|
|||
|
||||
ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)
|
||||
totalBases = countBases(samples, OPTIONS.sequenceIndex)
|
||||
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
|
||||
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))
|
||||
totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
|
||||
totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
|
||||
|
||||
snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
|
||||
indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
|
||||
indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)
|
||||
|
||||
out = sys.stdout
|
||||
if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ class PipelineArgs:
|
|||
return cmd
|
||||
|
||||
def chrsToSplitBy(self, chrs):
|
||||
#print 'XXX', self.excludeChrs
|
||||
return filter(lambda x: x not in self.excludeChrs, chrs)
|
||||
|
||||
#
|
||||
|
|
@ -101,6 +102,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
|
|||
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
|
||||
return chrCmd, chrOutputMap
|
||||
|
||||
#print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18)
|
||||
splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
|
||||
splitCommands = map(lambda x: x[0], splits)
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ def main():
|
|||
def includeStage(name):
|
||||
return name in stages
|
||||
|
||||
out = open(outputBamList, 'w')
|
||||
#out = open(outputBamList, 'w')
|
||||
realignInfo = []
|
||||
for chr in getContigs(OPTIONS.contigs, hg18):
|
||||
lastJobs = None
|
||||
|
|
@ -92,16 +92,19 @@ def main():
|
|||
realignInfo.append([realignJobs, realignedBam])
|
||||
# need to merge and then index
|
||||
indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
|
||||
print >> out, os.path.abspath(realignedBam)
|
||||
#print >> out, os.path.abspath(realignedBam)
|
||||
|
||||
out.close()
|
||||
#out.close()
|
||||
|
||||
if 'merge' in stages:
|
||||
realignerJobs = []
|
||||
if realignInfo[0][0] != []:
|
||||
realignerJobs = map(lambda x: x[0][0], realignInfo)
|
||||
mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
|
||||
mergedBam = outputRoot + ".bam"
|
||||
mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs)
|
||||
indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob)
|
||||
allJobs.append(mergerJob)
|
||||
allJobs.append(indexJob)
|
||||
|
||||
print 'EXECUTING JOBS'
|
||||
executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
||||
|
|
@ -113,7 +116,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
|||
|
||||
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
||||
outputBAM = outputRoot + ".bam"
|
||||
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
|
||||
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr)
|
||||
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
||||
|
||||
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
||||
|
|
@ -123,7 +126,9 @@ def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
|
|||
print lastJobs
|
||||
#cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
|
||||
cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5)
|
||||
return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
|
||||
jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
|
||||
#jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 )
|
||||
return jobs1
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue