Misc. changes to Python scripts

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-09 00:13:35 +00:00
parent 6ffcaa0afe
commit dd978dd525
6 changed files with 29 additions and 19 deletions

View File

@ -11,7 +11,7 @@ import string
def average(l):
sum = reduce(operator.add, l, 0)
return sum / len(l)
return sum / (1.0 * len(l))
def printHeaderSep():
print
@ -104,12 +104,17 @@ def findVariantEvalResults(key, file, type=str):
def getDBSNPRate(file):
if file != None:
key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"
return findVariantEvalResults(key, file, float)[0]
r = findVariantEvalResults(key, file, float)
if len(r) > 0:
return r[0]
else:
return -1
else:
return -1
def useProject(project):
return OPTIONS.project == None or project == OPTIONS.project
#print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
return OPTIONS.project == None or re.match(OPTIONS.project, project) != None
def countMappedBases(samples, alignmentIndex):
if ( OPTIONS.coverageFile != None ):
@ -148,16 +153,21 @@ def readMappedBasesFromBAS(samples, alignmentIndex):
def countSNPs(samples, snpsVCF, useIndels = False):
total = 0
novel = 0
header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))
sampleIDs = columnNames[9:]
print 'Counting SNPs...'
for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
break
if vcf.passesFilters():
if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
if ( vcf.isVariant() ):
total += 1
if ( vcf.isNovel() ): novel += 1
if ( OPTIONS.verbose and total % 10000 == 0 ):
print ' progress', vcf.getChrom(), vcf.getPos()
@ -176,7 +186,7 @@ def countSNPs(samples, snpsVCF, useIndels = False):
samples[sampleID].nSNPs += 1
printStatus(samples)
return total
return total, novel
def countIndels(samples, indelsVCF):
total = 0
@ -184,7 +194,7 @@ def countIndels(samples, indelsVCF):
if ( indelsVCF != None ):
return countSNPs(samples, indelsVCF, True)
return total
return total, 0
def readSamples(vcf):
print 'Reading samples for', OPTIONS.population
@ -229,11 +239,11 @@ if __name__ == "__main__":
ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)
totalBases = countBases(samples, OPTIONS.sequenceIndex)
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
totalSNPs = countSNPs(samples, OPTIONS.snps)
totalIndels = countIndels(samples, OPTIONS.indels)
totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE)
indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
out = sys.stdout
if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
@ -247,8 +257,8 @@ if __name__ == "__main__":
print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)
print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)
print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues()))
print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))
print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)
print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues()))
print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))
out.close()

View File

@ -133,7 +133,7 @@ def buildExecutionString(job, farm_queue = None, debug = True):
#farm_stdout = None
farm_stdout = "%J.lsf.output"
cmd_str = "bsub -q " + farm_queue
cmd_str = "bsub -r -q " + farm_queue
if farm_stdout != None:
cmd_str += " -o " + farm_stdout

View File

@ -45,7 +45,7 @@ def appendExtension(path, newExt, addExtension = True):
# return os.path.join(OPTIONS.dir, s)
class PipelineArgs:
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '3g', excludeChrs = [] ):
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
self.ref = ref
self.name = name

View File

@ -154,7 +154,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ):
if useSamtools:
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
else:
@ -166,13 +166,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
if MSD: MSDStr = 'MSD=true'
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ):
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ):
if type(inputFiles) <> list:
inputFiles = list(inputFiles)
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) )
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) )
def getPicardPath(lane, picardRoot = '/seq/picard/'):
flowcell, laneNo = lane.split('.')

View File

@ -113,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
outputBAM = outputRoot + ".bam"
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr)
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):

View File

@ -74,7 +74,7 @@ class VCFRecord:
def getFilter(self): return self.get("FILTER")
def failsFilters(self): return not self.passesFilters()
def passesFilters(self):
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES'
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASS'
#print self.getFilter(), ">>>", v, self
return v
@ -180,6 +180,7 @@ def readVCFHeader(lines):
return header, columnNames, itertools.chain([line], lines)
# we reach this point for empty files
#print 'header is', header
return header, columnNames, []
def quickCountRecords(lines):