From cf910d9cc2a058b445f6593d73b5d1ad2d42ad40 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 30 Jun 2010 16:33:32 +0000 Subject: [PATCH] misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a --- python/1kgStatsForCalls.py | 254 +++++++++++++++++++++++++++++++++++++ python/makeIGVCategory.py | 24 ++++ python/picard_utils.py | 7 + python/realignBamByChr.py | 16 ++- python/vcfReader.py | 17 ++- 5 files changed, 308 insertions(+), 10 deletions(-) create mode 100755 python/1kgStatsForCalls.py create mode 100755 python/makeIGVCategory.py diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py new file mode 100755 index 000000000..ff0c9e9cd --- /dev/null +++ b/python/1kgStatsForCalls.py @@ -0,0 +1,254 @@ +# import farm_commands2 +import os.path +import sys +from optparse import OptionParser +import glob +import operator +import itertools +import re +import vcfReader +import string + +def average(l): + sum = reduce(operator.add, l, 0) + return sum / len(l) + +def printHeaderSep(): + print + print ''.join(['-'] * 80) + +class Sample: + def __init__(self, name): + self.name = name + self.rawBases = 0 + self.mappedBases = 0 + self.nSNPs = 0 + self.nIndels = 0 + + def getName(self): return self.name + def getNSNPs(self): return self.nSNPs + def getNIndels(self): return self.nIndels + + def __str__(self): + return '[%s rawBases=%d mappedBases=%d percentMapped=%.2f nSNPs=%d nIndels=%d]' % (self.name, self.rawBases, self.mappedBases, (self.mappedBases * 100.0) / max(self.rawBases,1), self.nSNPs, self.nIndels) + __repr__ = __str__ + +def flatFileIterator(file, fields = None, skip = 0): + count = 0 + for line in open(file): + count += 1 + if count > skip: + s = map(string.strip, line.split('\t')) + if ( fields != None ): + s = map(lambda field: s[field], fields) + + if len(s) == 1: s = s[0] + yield s + +# 1. FASTQ_FILE, path to fastq file on ftp site +# 2. MD5, md5sum of file +# 3. RUN_ID, SRA/ERA run accession +# 4. STUDY_ID, SRA/ERA study accession +# 5. STUDY_NAME, Name of stury +# 6. CENTER_NAME, Submission centre name +# 7. SUBMISSION_ID, SRA/ERA submission accession +# 8. SUBMISSION_DATE, Date sequence submitted, YYYY-MM-DAY +# 9. SAMPLE_ID, SRA/ERA sample accession +# 10. SAMPLE_NAME, Sample name +# 11. POPULATION, Sample population +# 12. EXPERIMENT_ID, Experiment accession +# 13. INSTRUMENT_PLATFORM, Type of sequencing machine +# 14. INSTRUMENT_MODEL, Model of sequencing machine +# 15. LIBRARY_NAME, Library name +# 16. RUN_NAME, Name of machine run +# 17. RUN_BLOCK_NAME, Name of machine run sector +# 18. INSERT_SIZE, Submitter specifed insert size +# 19. LIBRARY_LAYOUT, Library layout, this can be either PAIRED or SINGLE +# 20. PAIRED_FASTQ, Name of mate pair file if exists (Runs with failed mates will have +# a library layout of PAIRED but no paired fastq file) +# 21. WITHDRAWN, 0/1 to indicate if the file has been withdrawn, only present if a file has been withdrawn +# 22. WITHDRAWN_DATE, date of withdrawal, this should only be defined if a file is +# withdrawn +# 23. COMMENT, comment about reason for withdrawal +# 24. READ_COUNT, read count for the file +# 25. BASE_COUNT, basepair count for the file +def countBases(samples, seqIndex): + total = 0 + + for project, sampleID, withdrawnP, bases in flatFileIterator(seqIndex, [3,9,20,24]): + if ( withdrawnP == "0" and useProject(project) and sampleID in samples ): + if OPTIONS.verbose: print project, sampleID, withdrawnP, bases + sample = samples[sampleID] + sample.rawBases += int(bases) + total += int(bases) + + printStatus(samples) + print 'Total raw bases', total + return total + +def printStatus(samples): + for sample in samples.itervalues(): + print sample + +def findVariantEvalResults(key, file, type=str): + def capture1(line): + if key in line: + s = line.split() + return type(s[len(s)-1]) + else: + return None + + return [val for val in map(capture1, open(file)) if val != None] + + +def getDBSNPRate(file): + if file != None: + key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]" + return findVariantEvalResults(key, file, float)[0] + else: + return -1 + +def useProject(project): + return OPTIONS.project == None or project == OPTIONS.project + +def countMappedBases(samples, alignmentIndex): + if ( OPTIONS.coverageFile != None ): + # read from summary file, looking for the line: + # Total 340710 1187.14 N/A N/A N/A + for parts in map( string.split, open(OPTIONS.coverageFile) ): + if parts[0] == "Total": + return -1, int(parts[1]) + else: + return readMappedBasesFromBAS(samples, alignmentIndex) + +def readMappedBasesFromBAS(samples, alignmentIndex): + totalBases = 0 + totalMapped = 0 + + for project, sampleID, basFile in flatFileIterator(alignmentIndex, [2,3,6]): + #print project, sampleID, basFile + if ( useProject(project) and sampleID in samples ): + if OPTIONS.verbose: print project, sampleID, basFile + sample = samples[sampleID] + + for rawBases, mappedBases in flatFileIterator(os.path.join(OPTIONS.root, basFile), [7, 8], skip=1): + #print ' ->', rawBases, mappedBases + if OPTIONS.rawBasesFromBas: + sample.rawBases += int(rawBases) + totalBases += int(rawBases) + sample.mappedBases += int(mappedBases) + totalMapped += int(mappedBases) + #print ' totals', totalBases, totalMapped + + printStatus(samples) + print 'Total raw bases', totalBases + print 'Total mapped bases', totalMapped + + return totalBases, totalMapped + +def countSNPs(samples, snpsVCF, useIndels = False): + total = 0 + + header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF)) + sampleIDs = columnNames[9:] + + for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False): + if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ): + break + if vcf.passesFilters(): + if ( vcf.isVariant() ): + total += 1 + + if ( OPTIONS.verbose and total % 10000 == 0 ): + print ' progress', vcf.getChrom(), vcf.getPos() + + genotypes = vcf.rest[1:] + for sampleID, genotypeField in itertools.izip(sampleIDs, genotypes): + #print sampleID, samples + if sampleID in samples: + genotype = genotypeField.split(':')[0] + variant = genotype != "0/0" and genotype != "0|0" and genotype != "0\0" and genotype != "./." + #print ' => ', vcf, sampleID, genotype, variant + if variant: + if ( useIndels ): + samples[sampleID].nIndels += 1 + else: + samples[sampleID].nSNPs += 1 + + printStatus(samples) + return total + +def countIndels(samples, indelsVCF): + total = 0 + + if ( indelsVCF != None ): + return countSNPs(samples, indelsVCF, True) + + return total + +def readSamples(vcf): + print 'Reading samples for', OPTIONS.population + header, columnNames, remainingLines = vcfReader.readVCFHeader(open(vcf)) + samples = map(Sample, columnNames[9:]) + if ( OPTIONS.onlySample != None ): + samples = filter( lambda x: x.getName() == OPTIONS.onlySample, samples ) + + print 'No. samples: ', len(samples) + print 'Samples: ', map(Sample.getName, samples) + + return dict(map( lambda x: (x.getName(), x), samples)) + +if __name__ == "__main__": + usage = "usage: %prog" + parser = OptionParser(usage=usage) + parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex",type='string', default=None, help="1KG formated alignment index file") + parser.add_option("-s", "--sequenceIndex", dest="sequenceIndex", type='string', default=None, help="1KG formated sequence index file") + parser.add_option("", "--onlySample", dest="onlySample", type='string', default=None, help="If provide, only this sample will be processed") + parser.add_option("", "--snps", dest="snps", type='string', default=None, help="SNPs VCF") + parser.add_option("", "--snpsEval", dest="snpsVE", type='string', default=None, help="SNPs VCF VariantEval") + parser.add_option("", "--indels", dest="indels", type='string', default=None, help="Indels VCF") + parser.add_option("", "--indelsEval", dest="indelsVE", type='string', default=None, help="Indels VCF VariantEval") + parser.add_option("", "--totalGenome", dest="totalGenome", type='float', default=2.96e9, help="Size, in bp, of the callable genome") + parser.add_option("", "--calledGenome", dest="calledGenome", type='float', default=None, help="Size, in bp, of the callable genome") + parser.add_option("-p", "--pop", dest="population", type='string', default="Anonymous", help="Population") + parser.add_option("", "--project", dest="project", type='string', default=None, help="If provided, will only include fastq/BAM files that match this project in the stats calculations") + parser.add_option("-r", "--root", dest="root",type='string', default=".", help="Path to the 1KG data") + parser.add_option("-M", "--maxRecords", dest="maxRecords", type='int', default=-1, help="If provided, will only include fastq/BAM files that match this regex in the stats calculations") + parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="If provided, will be verbose during output") + parser.add_option("", "--rawBasesFromBas", dest="rawBasesFromBas", action='store_true', default=False, help="If provided, we'll take our raw base counts from the BAS file") + parser.add_option("-o", "--output", dest="output",type='string', default=None, help="Path to the 1KG data") + parser.add_option("-c", "--coverageFile", dest="coverageFile",type='string', default=None, help="Path to GATK DoC .sample_summary file") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 0: + parser.error("incorrect number of arguments") + + samples = readSamples(OPTIONS.snps) + nSamples = len(samples) + + ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex) + totalBases = countBases(samples, OPTIONS.sequenceIndex) + meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples + totalSNPs = countSNPs(samples, OPTIONS.snps) + totalIndels = countIndels(samples, OPTIONS.indels) + + snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE) + indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE) + + out = sys.stdout + if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w') + print >> out, 'number of samples', nSamples + print >> out, 'total raw bases', totalBases + print >> out, 'total mapped bases', totalMappedBases + + # mean mapped depth is total bases mapped divided by acgt reference base count divided by number of individuals, after rmdup: for exons this is calculated on the target region only + + print >> out, 'mean mapped depth', meanMappedDepth + + print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome) + print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate) + print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues())) + print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate) + print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues())) + out.close() + diff --git a/python/makeIGVCategory.py b/python/makeIGVCategory.py new file mode 100755 index 000000000..6d6bde30c --- /dev/null +++ b/python/makeIGVCategory.py @@ -0,0 +1,24 @@ +import os.path +from optparse import OptionParser + +def main(): + global OPTIONS + usage = "usage: %prog [options] files" + parser = OptionParser(usage=usage) + parser.add_option("-c", "--category", dest="category", + type='string', default="Anonymous", + help="If provided, catagory name") + + (OPTIONS, args) = parser.parse_args() + if len(args) == 0: + parser.error("incorrect number of arguments") + + print '' % OPTIONS.category + for arg in args: + path = os.path.abspath(arg) + name = os.path.basename(arg) + print ' ' % (name, path) + print '' + +if __name__ == "__main__": + main() diff --git a/python/picard_utils.py b/python/picard_utils.py index e42d7a8b5..2a44b31f5 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -19,6 +19,7 @@ ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" analysis = "CombineDuplicates" MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar' +FIXMATES_BIN = '/seq/software/picard/current/bin/FixMateInformation.jar' SAMTOOLS_MERGE_BIN = '/seq/dirseq/samtools/current/samtools merge' CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar' @@ -167,6 +168,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) +def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ): + if type(inputFiles) <> list: + inputFiles = list(inputFiles) + + return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) ) + def getPicardPath(lane, picardRoot = '/seq/picard/'): flowcell, laneNo = lane.split('.') filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*') diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py index d26ce0e4a..bf443e069 100755 --- a/python/realignBamByChr.py +++ b/python/realignBamByChr.py @@ -12,6 +12,12 @@ import string import picard_utils from madPipelineUtils import * +def getContigs(optionContigs, hg18): + if optionContigs != None: + return optionContigs.split(",") + else: + return hg18 + def main(): global OPTIONS usage = "usage: %prog [options] stages input.bam outputRoot" @@ -28,6 +34,9 @@ def main(): parser.add_option("-n", "--name", dest="name", type="string", default="realignBamByChr", help="Farm queue to send processing jobs to") + parser.add_option("-c", "--contigs", dest="contigs", + type="string", default=None, + help="Comma-separated list of contig:start-stop values to pass to the cleaner. Overrides whole-genome if provided") parser.add_option("-q", "--farm", dest="farmQueue", type="string", default=None, help="Farm queue to send processing jobs to") @@ -61,7 +70,7 @@ def main(): out = open(outputBamList, 'w') realignInfo = [] - for chr in hg18: + for chr in getContigs(OPTIONS.contigs, hg18): lastJobs = None def updateNewJobs(newjobs, lastJobs): @@ -104,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ): def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ): outputBAM = outputRoot + ".bam" - GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr) + GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr) return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): @@ -112,7 +121,8 @@ def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ): print lastJobs - cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 ) + #cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 ) + cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5) return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs) if __name__ == "__main__": diff --git a/python/vcfReader.py b/python/vcfReader.py index 8cbcebb91..14c4d84e6 100755 --- a/python/vcfReader.py +++ b/python/vcfReader.py @@ -74,7 +74,7 @@ class VCFRecord: def getFilter(self): return self.get("FILTER") def failsFilters(self): return not self.passesFilters() def passesFilters(self): - v = self.getFilter() == "." or self.getFilter() == "0" + v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES' #print self.getFilter(), ">>>", v, self return v @@ -110,6 +110,8 @@ class VCFRecord: else: return str(x) + '=' + str(y) v = ';'.join(map(lambda x: info2str(*x), sorted(self.info.iteritems(), key=lambda x: x[0]))) + if v == "": + v = "." #print 'V = ', v return v @@ -134,12 +136,13 @@ class VCFRecord: def parseInfo(s): d = dict() - for elt in s.split(";"): - if '=' in elt: - key, val = elt.split('=') - else: - key, val = elt, 1 - d[key] = val + if s != "": + for elt in s.split(";"): + if '=' in elt: + key, val = elt.split('=') + else: + key, val = elt, 1 + d[key] = val return d # def parseInfo(s):