misc. changes to python scripts
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6ffcaa0afe
commit
dd978dd525
|
|
@ -11,7 +11,7 @@ import string
|
|||
|
||||
def average(l):
|
||||
sum = reduce(operator.add, l, 0)
|
||||
return sum / len(l)
|
||||
return sum / (1.0 * len(l))
|
||||
|
||||
def printHeaderSep():
|
||||
print
|
||||
|
|
@ -104,12 +104,17 @@ def findVariantEvalResults(key, file, type=str):
|
|||
def getDBSNPRate(file):
|
||||
if file != None:
|
||||
key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"
|
||||
return findVariantEvalResults(key, file, float)[0]
|
||||
r = findVariantEvalResults(key, file, float)
|
||||
if len(r) > 0:
|
||||
return r[0]
|
||||
else:
|
||||
return -1
|
||||
else:
|
||||
return -1
|
||||
|
||||
def useProject(project):
|
||||
return OPTIONS.project == None or project == OPTIONS.project
|
||||
#print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
|
||||
return OPTIONS.project == None or re.match(OPTIONS.project, project) != None
|
||||
|
||||
def countMappedBases(samples, alignmentIndex):
|
||||
if ( OPTIONS.coverageFile != None ):
|
||||
|
|
@ -148,16 +153,21 @@ def readMappedBasesFromBAS(samples, alignmentIndex):
|
|||
|
||||
def countSNPs(samples, snpsVCF, useIndels = False):
|
||||
total = 0
|
||||
novel = 0
|
||||
|
||||
header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))
|
||||
sampleIDs = columnNames[9:]
|
||||
|
||||
print 'Counting SNPs...'
|
||||
for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
|
||||
if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
|
||||
break
|
||||
if vcf.passesFilters():
|
||||
|
||||
if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
|
||||
if ( vcf.isVariant() ):
|
||||
total += 1
|
||||
|
||||
if ( vcf.isNovel() ): novel += 1
|
||||
|
||||
if ( OPTIONS.verbose and total % 10000 == 0 ):
|
||||
print ' progress', vcf.getChrom(), vcf.getPos()
|
||||
|
|
@ -176,7 +186,7 @@ def countSNPs(samples, snpsVCF, useIndels = False):
|
|||
samples[sampleID].nSNPs += 1
|
||||
|
||||
printStatus(samples)
|
||||
return total
|
||||
return total, novel
|
||||
|
||||
def countIndels(samples, indelsVCF):
|
||||
total = 0
|
||||
|
|
@ -184,7 +194,7 @@ def countIndels(samples, indelsVCF):
|
|||
if ( indelsVCF != None ):
|
||||
return countSNPs(samples, indelsVCF, True)
|
||||
|
||||
return total
|
||||
return total, 0
|
||||
|
||||
def readSamples(vcf):
|
||||
print 'Reading samples for', OPTIONS.population
|
||||
|
|
@ -229,11 +239,11 @@ if __name__ == "__main__":
|
|||
ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)
|
||||
totalBases = countBases(samples, OPTIONS.sequenceIndex)
|
||||
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
|
||||
totalSNPs = countSNPs(samples, OPTIONS.snps)
|
||||
totalIndels = countIndels(samples, OPTIONS.indels)
|
||||
totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
|
||||
totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
|
||||
|
||||
snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
|
||||
indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE)
|
||||
indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
|
||||
|
||||
out = sys.stdout
|
||||
if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
|
||||
|
|
@ -247,8 +257,8 @@ if __name__ == "__main__":
|
|||
|
||||
print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)
|
||||
print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)
|
||||
print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues()))
|
||||
print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))
|
||||
print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)
|
||||
print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues()))
|
||||
print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))
|
||||
out.close()
|
||||
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ def buildExecutionString(job, farm_queue = None, debug = True):
|
|||
#farm_stdout = None
|
||||
farm_stdout = "%J.lsf.output"
|
||||
|
||||
cmd_str = "bsub -q " + farm_queue
|
||||
cmd_str = "bsub -r -q " + farm_queue
|
||||
if farm_stdout != None:
|
||||
cmd_str += " -o " + farm_stdout
|
||||
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ def appendExtension(path, newExt, addExtension = True):
|
|||
# return os.path.join(OPTIONS.dir, s)
|
||||
|
||||
class PipelineArgs:
|
||||
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
|
||||
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '3g', excludeChrs = [] ):
|
||||
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
|
||||
self.ref = ref
|
||||
self.name = name
|
||||
|
|
|
|||
|
|
@ -154,7 +154,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
|||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ):
|
||||
if useSamtools:
|
||||
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
||||
else:
|
||||
|
|
@ -166,13 +166,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
|||
if MSD: MSDStr = 'MSD=true'
|
||||
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ):
|
||||
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ):
|
||||
if type(inputFiles) <> list:
|
||||
inputFiles = list(inputFiles)
|
||||
|
||||
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) )
|
||||
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) )
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
flowcell, laneNo = lane.split('.')
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
|||
|
||||
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
||||
outputBAM = outputRoot + ".bam"
|
||||
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr)
|
||||
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
|
||||
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
||||
|
||||
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ class VCFRecord:
|
|||
def getFilter(self): return self.get("FILTER")
|
||||
def failsFilters(self): return not self.passesFilters()
|
||||
def passesFilters(self):
|
||||
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES'
|
||||
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASS'
|
||||
#print self.getFilter(), ">>>", v, self
|
||||
return v
|
||||
|
||||
|
|
@ -180,6 +180,7 @@ def readVCFHeader(lines):
|
|||
return header, columnNames, itertools.chain([line], lines)
|
||||
|
||||
# we reach this point for empty files
|
||||
#print 'header is', header
|
||||
return header, columnNames, []
|
||||
|
||||
def quickCountRecords(lines):
|
||||
|
|
|
|||
Loading…
Reference in New Issue