gatk-3.8/python/1kgStatsForCalls.py

# import farm_commands2
import os.path
import sys
from optparse import OptionParser
import glob
import operator
import itertools
import re
import vcfReader
import string

def average(l):
    sum = reduce(operator.add, l, 0)
    return sum / (1.0 * (max(len(l), 1)))

def printHeaderSep():
    print
    print ''.join(['-'] * 80)

class Sample:
    def __init__(self, name):
        self.name = name
        self.rawBases = 0
        self.mappedBases = 0
        self.nSNPs = 0
        self.nIndels = 0
        
    def getName(self): return self.name
    def getNSNPs(self): return self.nSNPs
    def getNIndels(self): return self.nIndels
        
    def __str__(self):
        return '[%s rawBases=%d mappedBases=%d percentMapped=%.2f nSNPs=%d nIndels=%d]' % (self.name, self.rawBases, self.mappedBases, (self.mappedBases * 100.0) / max(self.rawBases,1), self.nSNPs, self.nIndels)
    __repr__ = __str__
    
def flatFileIterator(file, fields = None, skip = 0):
    count = 0
    for line in open(file):
        count += 1
        if count > skip:
            s = map(string.strip, line.split('\t'))
            if ( fields != None ):
                s = map(lambda field: s[field], fields)
    
            if len(s) == 1: s = s[0]
            yield s

# 1.  FASTQ_FILE, path to fastq file on ftp site  
# 2.  MD5, md5sum of file
# 3.  RUN_ID, SRA/ERA run accession
# 4.  STUDY_ID, SRA/ERA study accession   
# 5.  STUDY_NAME, Name of stury   
# 6.  CENTER_NAME, Submission centre name 
# 7.  SUBMISSION_ID, SRA/ERA submission accession 
# 8.  SUBMISSION_DATE, Date sequence submitted, YYYY-MM-DAY
# 9.  SAMPLE_ID, SRA/ERA sample accession 
# 10. SAMPLE_NAME, Sample name
# 11. POPULATION, Sample population
# 12. EXPERIMENT_ID, Experiment accession 
# 13. INSTRUMENT_PLATFORM, Type of sequencing machine
# 14. INSTRUMENT_MODEL, Model of sequencing machine 
# 15. LIBRARY_NAME, Library name  
# 16. RUN_NAME, Name of machine run
# 17. RUN_BLOCK_NAME, Name of machine run sector  
# 18. INSERT_SIZE, Submitter specifed insert size 
# 19. LIBRARY_LAYOUT, Library layout, this can be either PAIRED or SINGLE 
# 20. PAIRED_FASTQ, Name of mate pair file if exists (Runs with failed mates will have 
#     a library layout of PAIRED but no paired fastq file)
# 21. WITHDRAWN, 0/1 to indicate if the file has been withdrawn, only present if a file has been withdrawn
# 22. WITHDRAWN_DATE, date of withdrawal, this should only be defined if a file is 
#     withdrawn
# 23. COMMENT, comment about reason for withdrawal
# 24. READ_COUNT, read  count for the file
# 25. BASE_COUNT, basepair count for the file
def countBases(samples, seqIndex):
    total = 0
    
    for project, sampleID, withdrawnP, bases in flatFileIterator(seqIndex, [3,9,20,24]):
        if ( withdrawnP == "0" and useProject(project) and sampleID in samples ):
            if OPTIONS.verbose: print project, sampleID, withdrawnP, bases
            sample = samples[sampleID]
            sample.rawBases += int(bases)
            total += int(bases)
    
    printStatus(samples)
    print 'Total raw bases', total
    return total

def printStatus(samples):
    if OPTIONS.verbose:
        for sample in samples.itervalues():
            print sample

def findVariantEvalResults(key, file, type=str):
    def capture1(line):
        if key in line:
            s = line.split()
            return type(s[len(s)-1])
        else:
            return None

    return [val for val in map(capture1, open(file)) if val != None]


def getDBSNPRate(file):
    if file != None:
        key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"
        r = findVariantEvalResults(key, file, float)
        if len(r) > 0:
            return r[0]
        else:
            return -1
    else:
        return -1

def useProject(project):
    #print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
    return OPTIONS.project == None or re.match(OPTIONS.project, project) != None

def countMappedBases(samples, alignmentIndex):
    if ( OPTIONS.coverageFile != None ): 
        # read from summary file, looking for the line:
        # Total   340710  1187.14 N/A     N/A     N/A
        for parts in map( string.split, open(OPTIONS.coverageFile) ):
            if parts[0] == "Total":
                return -1, int(parts[1])
    else:
        return readMappedBasesFromBAS(samples, alignmentIndex)

def readMappedBasesFromBAS(samples, alignmentIndex):
    totalBases = 0 
    totalMapped = 0
    
    for project, sampleID, basFile in flatFileIterator(alignmentIndex, [2,3,6]):
        #print project, sampleID, basFile
        if ( useProject(project) and sampleID in samples ):
            if OPTIONS.verbose: print project, sampleID, basFile
            sample = samples[sampleID]
    
            for rawBases, mappedBases in flatFileIterator(os.path.join(OPTIONS.root, basFile), [7, 8], skip=1):
                #print '  ->', rawBases, mappedBases
                if OPTIONS.rawBasesFromBas:
                    sample.rawBases += int(rawBases)
                    totalBases += int(rawBases)
                sample.mappedBases += int(mappedBases)
                totalMapped += int(mappedBases)
                #print '  totals', totalBases, totalMapped
    
    printStatus(samples)
    print 'Total raw    bases', totalBases
    print 'Total mapped bases', totalMapped
    
    return totalBases, totalMapped

def countSNPs(samples, snpsVCF, useIndels = False):
    total = 0
    novel = 0
    
    header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))
    sampleIDs = columnNames[9:]

    print 'Counting SNPs...'
    #lines = 0
    for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
        #lines += 1
        #print 'lines', lines
        if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
            break
            
        if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:
            if ( vcf.isVariant() ): 
                total += 1

                if ( vcf.isNovel() ): novel += 1
                
                if ( OPTIONS.verbose and total % 10000 == 0 ):
                    print '  progress', vcf.getChrom(), vcf.getPos()
                
                genotypes = vcf.rest[1:]
                for sampleID, genotypeField in itertools.izip(sampleIDs, genotypes):
                    #print sampleID, samples
                    if sampleID in samples:
                        genotype = genotypeField.split(':')[0]
                        variant = genotype != "0/0" and genotype != "0|0" and genotype != "0\0" and genotype != "./."
                        #print '  => ', vcf, sampleID, genotype, variant
                        if variant:
                            if ( useIndels ):
                                samples[sampleID].nIndels += 1
                            else:
                                samples[sampleID].nSNPs += 1
        else:
            print 'rejecting line', vcf

    printStatus(samples)
    return total, novel

def countIndels(samples, indelsVCF):
    total = 0
    
    if ( indelsVCF != None ):
        return countSNPs(samples, indelsVCF, True)
    
    return total, 0

def readSamples(vcf):
    print 'Reading samples for', OPTIONS.population
    header, columnNames, remainingLines = vcfReader.readVCFHeader(open(vcf))
    samples = map(Sample, columnNames[9:])
    if ( OPTIONS.onlySample != None ):
        samples = filter( lambda x: x.getName() == OPTIONS.onlySample, samples )
    
    print 'No. samples: ', len(samples)
    print 'Samples:     ', map(Sample.getName, samples)

    return dict(map( lambda x: (x.getName(), x), samples))
    
if __name__ == "__main__":
    usage = "usage: %prog"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex",type='string', default=None, help="1KG formated alignment index file")
    parser.add_option("-s", "--sequenceIndex", dest="sequenceIndex", type='string', default=None, help="1KG formated sequence index file")
    parser.add_option("", "--onlySample", dest="onlySample", type='string', default=None, help="If provide, only this sample will be processed")
    parser.add_option("", "--snps", dest="snps", type='string', default=None, help="SNPs VCF")
    parser.add_option("", "--snpsEval", dest="snpsVE", type='string', default=None, help="SNPs VCF VariantEval")
    parser.add_option("", "--indels", dest="indels", type='string', default=None, help="Indels VCF")
    parser.add_option("", "--indelsEval", dest="indelsVE", type='string', default=None, help="Indels VCF VariantEval")
    parser.add_option("", "--totalGenome", dest="totalGenome", type='float', default=2.96e9, help="Size, in bp, of the callable genome")
    parser.add_option("", "--calledGenome", dest="calledGenome", type='float', default=None, help="Size, in bp, of the callable genome")
    parser.add_option("-p", "--pop", dest="population", type='string', default="Anonymous", help="Population")
    parser.add_option("", "--project", dest="project", type='string', default=None, help="If provided, will only include fastq/BAM files that match this project in the stats calculations")
    parser.add_option("-r", "--root", dest="root",type='string', default=".", help="Path to the 1KG data")
    parser.add_option("-M", "--maxRecords", dest="maxRecords", type='int', default=-1, help="If provided, will only include fastq/BAM files that match this regex in the stats calculations")
    parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="If provided, will be verbose during output")
    parser.add_option("", "--rawBasesFromBas", dest="rawBasesFromBas", action='store_true', default=False, help="If provided, we'll take our raw base counts from the BAS file")
    parser.add_option("-o", "--output", dest="output",type='string', default=None, help="Path to the 1KG data")
    parser.add_option("-c", "--coverageFile", dest="coverageFile",type='string', default=None, help="Path to GATK DoC .sample_summary file")
                        
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")

    samples = readSamples(OPTIONS.snps)
    nSamples = len(samples)    

    ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)    
    totalBases = countBases(samples, OPTIONS.sequenceIndex)
    meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))
    totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
    totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
    
    snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
    indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)

    out = sys.stdout
    if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
    print >> out, 'number of samples', nSamples
    print >> out, 'total raw bases', totalBases
    print >> out, 'total mapped bases', totalMappedBases

    # mean mapped depth is total bases mapped divided by acgt reference base count divided by number of individuals, after rmdup: for exons this is calculated on the target region only
    
    print >> out, 'mean mapped depth', meanMappedDepth
    
    print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)
    print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)
    print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))
    print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)
    print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))
    out.close()
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`# import farm_commands2`
			`import os.path`
			`import sys`
			`from optparse import OptionParser`
			`import glob`
			`import operator`
			`import itertools`
			`import re`
			`import vcfReader`
			`import string`

			`def average(l):`
			`sum = reduce(operator.add, l, 0)`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`return sum / (1.0 * (max(len(l), 1)))`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`def printHeaderSep():`
			`print`
			`print ''.join(['-'] * 80)`

			`class Sample:`
			`def __init__(self, name):`
			`self.name = name`
			`self.rawBases = 0`
			`self.mappedBases = 0`
			`self.nSNPs = 0`
			`self.nIndels = 0`

			`def getName(self): return self.name`
			`def getNSNPs(self): return self.nSNPs`
			`def getNIndels(self): return self.nIndels`

			`def __str__(self):`
			`return '[%s rawBases=%d mappedBases=%d percentMapped=%.2f nSNPs=%d nIndels=%d]' % (self.name, self.rawBases, self.mappedBases, (self.mappedBases * 100.0) / max(self.rawBases,1), self.nSNPs, self.nIndels)`
			`__repr__ = __str__`

			`def flatFileIterator(file, fields = None, skip = 0):`
			`count = 0`
			`for line in open(file):`
			`count += 1`
			`if count > skip:`
			`s = map(string.strip, line.split('\t'))`
			`if ( fields != None ):`
			`s = map(lambda field: s[field], fields)`

			`if len(s) == 1: s = s[0]`
			`yield s`

			`# 1. FASTQ_FILE, path to fastq file on ftp site`
			`# 2. MD5, md5sum of file`
			`# 3. RUN_ID, SRA/ERA run accession`
			`# 4. STUDY_ID, SRA/ERA study accession`
			`# 5. STUDY_NAME, Name of stury`
			`# 6. CENTER_NAME, Submission centre name`
			`# 7. SUBMISSION_ID, SRA/ERA submission accession`
			`# 8. SUBMISSION_DATE, Date sequence submitted, YYYY-MM-DAY`
			`# 9. SAMPLE_ID, SRA/ERA sample accession`
			`# 10. SAMPLE_NAME, Sample name`
			`# 11. POPULATION, Sample population`
			`# 12. EXPERIMENT_ID, Experiment accession`
			`# 13. INSTRUMENT_PLATFORM, Type of sequencing machine`
			`# 14. INSTRUMENT_MODEL, Model of sequencing machine`
			`# 15. LIBRARY_NAME, Library name`
			`# 16. RUN_NAME, Name of machine run`
			`# 17. RUN_BLOCK_NAME, Name of machine run sector`
			`# 18. INSERT_SIZE, Submitter specifed insert size`
			`# 19. LIBRARY_LAYOUT, Library layout, this can be either PAIRED or SINGLE`
			`# 20. PAIRED_FASTQ, Name of mate pair file if exists (Runs with failed mates will have`
			`# a library layout of PAIRED but no paired fastq file)`
			`# 21. WITHDRAWN, 0/1 to indicate if the file has been withdrawn, only present if a file has been withdrawn`
			`# 22. WITHDRAWN_DATE, date of withdrawal, this should only be defined if a file is`
			`# withdrawn`
			`# 23. COMMENT, comment about reason for withdrawal`
			`# 24. READ_COUNT, read count for the file`
			`# 25. BASE_COUNT, basepair count for the file`
			`def countBases(samples, seqIndex):`
			`total = 0`

			`for project, sampleID, withdrawnP, bases in flatFileIterator(seqIndex, [3,9,20,24]):`
			`if ( withdrawnP == "0" and useProject(project) and sampleID in samples ):`
			`if OPTIONS.verbose: print project, sampleID, withdrawnP, bases`
			`sample = samples[sampleID]`
			`sample.rawBases += int(bases)`
			`total += int(bases)`

			`printStatus(samples)`
			`print 'Total raw bases', total`
			`return total`

			`def printStatus(samples):`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`if OPTIONS.verbose:`
			`for sample in samples.itervalues():`
			`print sample`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`def findVariantEvalResults(key, file, type=str):`
			`def capture1(line):`
			`if key in line:`
			`s = line.split()`
			`return type(s[len(s)-1])`
			`else:`
			`return None`

			`return [val for val in map(capture1, open(file)) if val != None]`


			`def getDBSNPRate(file):`
			`if file != None:`
			`key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`r = findVariantEvalResults(key, file, float)`
			`if len(r) > 0:`
			`return r[0]`
			`else:`
			`return -1`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`else:`
			`return -1`

			`def useProject(project):`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`#print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)`
			`return OPTIONS.project == None or re.match(OPTIONS.project, project) != None`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`def countMappedBases(samples, alignmentIndex):`
			`if ( OPTIONS.coverageFile != None ):`
			`# read from summary file, looking for the line:`
			`# Total 340710 1187.14 N/A N/A N/A`
			`for parts in map( string.split, open(OPTIONS.coverageFile) ):`
			`if parts[0] == "Total":`
			`return -1, int(parts[1])`
			`else:`
			`return readMappedBasesFromBAS(samples, alignmentIndex)`

			`def readMappedBasesFromBAS(samples, alignmentIndex):`
			`totalBases = 0`
			`totalMapped = 0`

			`for project, sampleID, basFile in flatFileIterator(alignmentIndex, [2,3,6]):`
			`#print project, sampleID, basFile`
			`if ( useProject(project) and sampleID in samples ):`
			`if OPTIONS.verbose: print project, sampleID, basFile`
			`sample = samples[sampleID]`

			`for rawBases, mappedBases in flatFileIterator(os.path.join(OPTIONS.root, basFile), [7, 8], skip=1):`
			`#print ' ->', rawBases, mappedBases`
			`if OPTIONS.rawBasesFromBas:`
			`sample.rawBases += int(rawBases)`
			`totalBases += int(rawBases)`
			`sample.mappedBases += int(mappedBases)`
			`totalMapped += int(mappedBases)`
			`#print ' totals', totalBases, totalMapped`

			`printStatus(samples)`
			`print 'Total raw bases', totalBases`
			`print 'Total mapped bases', totalMapped`

			`return totalBases, totalMapped`

			`def countSNPs(samples, snpsVCF, useIndels = False):`
			`total = 0`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`novel = 0`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))`
			`sampleIDs = columnNames[9:]`

misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`print 'Counting SNPs...'`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`#lines = 0`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`#lines += 1`
			`#print 'lines', lines`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):`
			`break`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`if ( vcf.isVariant() ):`
			`total += 1`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00
			`if ( vcf.isNovel() ): novel += 1`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`if ( OPTIONS.verbose and total % 10000 == 0 ):`
			`print ' progress', vcf.getChrom(), vcf.getPos()`

			`genotypes = vcf.rest[1:]`
			`for sampleID, genotypeField in itertools.izip(sampleIDs, genotypes):`
			`#print sampleID, samples`
			`if sampleID in samples:`
			`genotype = genotypeField.split(':')[0]`
			`variant = genotype != "0/0" and genotype != "0\|0" and genotype != "0\0" and genotype != "./."`
			`#print ' => ', vcf, sampleID, genotype, variant`
			`if variant:`
			`if ( useIndels ):`
			`samples[sampleID].nIndels += 1`
			`else:`
			`samples[sampleID].nSNPs += 1`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`else:`
			`print 'rejecting line', vcf`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`printStatus(samples)`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`return total, novel`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`def countIndels(samples, indelsVCF):`
			`total = 0`

			`if ( indelsVCF != None ):`
			`return countSNPs(samples, indelsVCF, True)`

misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`return total, 0`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`def readSamples(vcf):`
			`print 'Reading samples for', OPTIONS.population`
			`header, columnNames, remainingLines = vcfReader.readVCFHeader(open(vcf))`
			`samples = map(Sample, columnNames[9:])`
			`if ( OPTIONS.onlySample != None ):`
			`samples = filter( lambda x: x.getName() == OPTIONS.onlySample, samples )`

			`print 'No. samples: ', len(samples)`
			`print 'Samples: ', map(Sample.getName, samples)`

			`return dict(map( lambda x: (x.getName(), x), samples))`

			`if __name__ == "__main__":`
			`usage = "usage: %prog"`
			`parser = OptionParser(usage=usage)`
			`parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex",type='string', default=None, help="1KG formated alignment index file")`
			`parser.add_option("-s", "--sequenceIndex", dest="sequenceIndex", type='string', default=None, help="1KG formated sequence index file")`
			`parser.add_option("", "--onlySample", dest="onlySample", type='string', default=None, help="If provide, only this sample will be processed")`
			`parser.add_option("", "--snps", dest="snps", type='string', default=None, help="SNPs VCF")`
			`parser.add_option("", "--snpsEval", dest="snpsVE", type='string', default=None, help="SNPs VCF VariantEval")`
			`parser.add_option("", "--indels", dest="indels", type='string', default=None, help="Indels VCF")`
			`parser.add_option("", "--indelsEval", dest="indelsVE", type='string', default=None, help="Indels VCF VariantEval")`
			`parser.add_option("", "--totalGenome", dest="totalGenome", type='float', default=2.96e9, help="Size, in bp, of the callable genome")`
			`parser.add_option("", "--calledGenome", dest="calledGenome", type='float', default=None, help="Size, in bp, of the callable genome")`
			`parser.add_option("-p", "--pop", dest="population", type='string', default="Anonymous", help="Population")`
			`parser.add_option("", "--project", dest="project", type='string', default=None, help="If provided, will only include fastq/BAM files that match this project in the stats calculations")`
			`parser.add_option("-r", "--root", dest="root",type='string', default=".", help="Path to the 1KG data")`
			`parser.add_option("-M", "--maxRecords", dest="maxRecords", type='int', default=-1, help="If provided, will only include fastq/BAM files that match this regex in the stats calculations")`
			`parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="If provided, will be verbose during output")`
			`parser.add_option("", "--rawBasesFromBas", dest="rawBasesFromBas", action='store_true', default=False, help="If provided, we'll take our raw base counts from the BAS file")`
			`parser.add_option("-o", "--output", dest="output",type='string', default=None, help="Path to the 1KG data")`
			`parser.add_option("-c", "--coverageFile", dest="coverageFile",type='string', default=None, help="Path to GATK DoC .sample_summary file")`

			`(OPTIONS, args) = parser.parse_args()`
			`if len(args) != 0:`
			`parser.error("incorrect number of arguments")`

			`samples = readSamples(OPTIONS.snps)`
			`nSamples = len(samples)`

			`ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)`
			`totalBases = countBases(samples, OPTIONS.sequenceIndex)`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)`
			`totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)`
Improvements to 1KG processing pipeline git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-16 23:33:47 +08:00			`indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00
			`out = sys.stdout`
			`if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')`
			`print >> out, 'number of samples', nSamples`
			`print >> out, 'total raw bases', totalBases`
			`print >> out, 'total mapped bases', totalMappedBases`

			`# mean mapped depth is total bases mapped divided by acgt reference base count divided by number of individuals, after rmdup: for exons this is calculated on the target region only`

			`print >> out, 'mean mapped depth', meanMappedDepth`

			`print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)`
			`print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)`
misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-09 08:13:35 +08:00			`print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))`
misc. useful updates to python library git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3683 348d0f76-0448-11de-a6fe-93d51630548a 2010-07-01 00:33:32 +08:00			`out.close()`