Improvements to 1KG processing pipeline

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-16 15:33:47 +00:00 · 2010-07-16 15:33:47 +00:00 · 2bdb011865
parent 1dd8a28a5d
commit 2bdb011865
3 changed files with 25 additions and 12 deletions
--- a/python/1kgStatsForCalls.py
+++ b/python/1kgStatsForCalls.py
@ -11,7 +11,7 @@ import string

 def average(l):
    sum = reduce(operator.add, l, 0)
-    return sum / (1.0 * len(l))
+    return sum / (1.0 * (max(len(l), 1)))

 def printHeaderSep():
    print
@ -87,8 +87,9 @@ def countBases(samples, seqIndex):
    return total

 def printStatus(samples):
-    for sample in samples.itervalues():
-        print sample
+    if OPTIONS.verbose:
+        for sample in samples.itervalues():
+            print sample

 def findVariantEvalResults(key, file, type=str):
    def capture1(line):
@ -159,11 +160,14 @@ def countSNPs(samples, snpsVCF, useIndels = False):
    sampleIDs = columnNames[9:]

    print 'Counting SNPs...'
+    #lines = 0
    for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
+        #lines += 1
+        #print 'lines', lines
        if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
            break
            
-        if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
+        if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:
            if ( vcf.isVariant() ): 
                total += 1

@ -184,6 +188,8 @@ def countSNPs(samples, snpsVCF, useIndels = False):
                                samples[sampleID].nIndels += 1
                            else:
                                samples[sampleID].nSNPs += 1
+        else:
+            print 'rejecting line', vcf

    printStatus(samples)
    return total, novel
@ -238,12 +244,12 @@ if __name__ == "__main__":

    ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)    
    totalBases = countBases(samples, OPTIONS.sequenceIndex)
-    meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
+    meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))
    totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
    totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
    
    snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
-    indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
+    indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)

    out = sys.stdout
    if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
--- a/python/madPipelineUtils.py
+++ b/python/madPipelineUtils.py
@ -73,6 +73,7 @@ class PipelineArgs:
        return cmd
        
    def chrsToSplitBy(self, chrs):
+        #print 'XXX', self.excludeChrs
        return filter(lambda x: x not in self.excludeChrs, chrs)
        
 # 
@ -101,6 +102,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
        chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
        return chrCmd, chrOutputMap

+    #print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18)
    splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
    splitCommands = map(lambda x: x[0], splits)

--- a/python/realignBamByChr.py
+++ b/python/realignBamByChr.py
@ -68,7 +68,7 @@ def main():
    def includeStage(name):
        return name in stages

-    out = open(outputBamList, 'w')
+    #out = open(outputBamList, 'w')
    realignInfo = []
    for chr in getContigs(OPTIONS.contigs, hg18):
        lastJobs = None
@ -92,16 +92,19 @@ def main():
        realignInfo.append([realignJobs, realignedBam])
        # need to merge and then index
        indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
-        print >> out, os.path.abspath(realignedBam)
+        #print >> out, os.path.abspath(realignedBam)
     
-    out.close() 
+    #out.close() 

    if 'merge' in stages:
        realignerJobs = []
        if realignInfo[0][0] != []:
            realignerJobs = map(lambda x: x[0][0], realignInfo)
-        mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
+        mergedBam = outputRoot + ".bam"
+        mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs)
+        indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob)
        allJobs.append(mergerJob)
+        allJobs.append(indexJob)

    print 'EXECUTING JOBS'
    executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) 
@ -113,7 +116,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):

 def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
    outputBAM = outputRoot + ".bam"
-    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
+    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr)
    return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM

 def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
@ -123,7 +126,9 @@ def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
    print lastJobs
    #cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
    cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5)
-    return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
+    jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
+    #jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 )
+    return jobs1

 if __name__ == "__main__":
    main()