2010-07-01 00:33:32 +08:00
# import farm_commands2
import os . path
import sys
from optparse import OptionParser
import glob
import operator
import itertools
import re
import vcfReader
import string
def average ( l ) :
sum = reduce ( operator . add , l , 0 )
2010-07-16 23:33:47 +08:00
return sum / ( 1.0 * ( max ( len ( l ) , 1 ) ) )
2010-07-01 00:33:32 +08:00
def printHeaderSep ( ) :
print
print ' ' . join ( [ ' - ' ] * 80 )
class Sample :
def __init__ ( self , name ) :
self . name = name
self . rawBases = 0
self . mappedBases = 0
self . nSNPs = 0
self . nIndels = 0
def getName ( self ) : return self . name
def getNSNPs ( self ) : return self . nSNPs
def getNIndels ( self ) : return self . nIndels
def __str__ ( self ) :
return ' [ %s rawBases= %d mappedBases= %d percentMapped= %.2f nSNPs= %d nIndels= %d ] ' % ( self . name , self . rawBases , self . mappedBases , ( self . mappedBases * 100.0 ) / max ( self . rawBases , 1 ) , self . nSNPs , self . nIndels )
__repr__ = __str__
def flatFileIterator ( file , fields = None , skip = 0 ) :
count = 0
for line in open ( file ) :
count + = 1
if count > skip :
s = map ( string . strip , line . split ( ' \t ' ) )
if ( fields != None ) :
s = map ( lambda field : s [ field ] , fields )
if len ( s ) == 1 : s = s [ 0 ]
yield s
# 1. FASTQ_FILE, path to fastq file on ftp site
# 2. MD5, md5sum of file
# 3. RUN_ID, SRA/ERA run accession
# 4. STUDY_ID, SRA/ERA study accession
# 5. STUDY_NAME, Name of stury
# 6. CENTER_NAME, Submission centre name
# 7. SUBMISSION_ID, SRA/ERA submission accession
# 8. SUBMISSION_DATE, Date sequence submitted, YYYY-MM-DAY
# 9. SAMPLE_ID, SRA/ERA sample accession
# 10. SAMPLE_NAME, Sample name
# 11. POPULATION, Sample population
# 12. EXPERIMENT_ID, Experiment accession
# 13. INSTRUMENT_PLATFORM, Type of sequencing machine
# 14. INSTRUMENT_MODEL, Model of sequencing machine
# 15. LIBRARY_NAME, Library name
# 16. RUN_NAME, Name of machine run
# 17. RUN_BLOCK_NAME, Name of machine run sector
# 18. INSERT_SIZE, Submitter specifed insert size
# 19. LIBRARY_LAYOUT, Library layout, this can be either PAIRED or SINGLE
# 20. PAIRED_FASTQ, Name of mate pair file if exists (Runs with failed mates will have
# a library layout of PAIRED but no paired fastq file)
# 21. WITHDRAWN, 0/1 to indicate if the file has been withdrawn, only present if a file has been withdrawn
# 22. WITHDRAWN_DATE, date of withdrawal, this should only be defined if a file is
# withdrawn
# 23. COMMENT, comment about reason for withdrawal
# 24. READ_COUNT, read count for the file
# 25. BASE_COUNT, basepair count for the file
def countBases ( samples , seqIndex ) :
total = 0
for project , sampleID , withdrawnP , bases in flatFileIterator ( seqIndex , [ 3 , 9 , 20 , 24 ] ) :
if ( withdrawnP == " 0 " and useProject ( project ) and sampleID in samples ) :
if OPTIONS . verbose : print project , sampleID , withdrawnP , bases
sample = samples [ sampleID ]
sample . rawBases + = int ( bases )
total + = int ( bases )
printStatus ( samples )
print ' Total raw bases ' , total
return total
def printStatus ( samples ) :
2010-07-16 23:33:47 +08:00
if OPTIONS . verbose :
for sample in samples . itervalues ( ) :
print sample
2010-07-01 00:33:32 +08:00
def findVariantEvalResults ( key , file , type = str ) :
def capture1 ( line ) :
if key in line :
s = line . split ( )
return type ( s [ len ( s ) - 1 ] )
else :
return None
return [ val for val in map ( capture1 , open ( file ) ) if val != None ]
def getDBSNPRate ( file ) :
if file != None :
key = " [evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point= % e vals at comp] "
2010-07-09 08:13:35 +08:00
r = findVariantEvalResults ( key , file , float )
if len ( r ) > 0 :
return r [ 0 ]
else :
return - 1
2010-07-01 00:33:32 +08:00
else :
return - 1
def useProject ( project ) :
2010-07-09 08:13:35 +08:00
#print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
return OPTIONS . project == None or re . match ( OPTIONS . project , project ) != None
2010-07-01 00:33:32 +08:00
def countMappedBases ( samples , alignmentIndex ) :
if ( OPTIONS . coverageFile != None ) :
# read from summary file, looking for the line:
# Total 340710 1187.14 N/A N/A N/A
for parts in map ( string . split , open ( OPTIONS . coverageFile ) ) :
if parts [ 0 ] == " Total " :
return - 1 , int ( parts [ 1 ] )
else :
return readMappedBasesFromBAS ( samples , alignmentIndex )
def readMappedBasesFromBAS ( samples , alignmentIndex ) :
totalBases = 0
totalMapped = 0
for project , sampleID , basFile in flatFileIterator ( alignmentIndex , [ 2 , 3 , 6 ] ) :
#print project, sampleID, basFile
if ( useProject ( project ) and sampleID in samples ) :
if OPTIONS . verbose : print project , sampleID , basFile
sample = samples [ sampleID ]
for rawBases , mappedBases in flatFileIterator ( os . path . join ( OPTIONS . root , basFile ) , [ 7 , 8 ] , skip = 1 ) :
#print ' ->', rawBases, mappedBases
if OPTIONS . rawBasesFromBas :
sample . rawBases + = int ( rawBases )
totalBases + = int ( rawBases )
sample . mappedBases + = int ( mappedBases )
totalMapped + = int ( mappedBases )
#print ' totals', totalBases, totalMapped
printStatus ( samples )
print ' Total raw bases ' , totalBases
print ' Total mapped bases ' , totalMapped
return totalBases , totalMapped
def countSNPs ( samples , snpsVCF , useIndels = False ) :
total = 0
2010-07-09 08:13:35 +08:00
novel = 0
2010-07-01 00:33:32 +08:00
header , columnNames , remainingLines = vcfReader . readVCFHeader ( open ( snpsVCF ) )
sampleIDs = columnNames [ 9 : ]
2010-07-09 08:13:35 +08:00
print ' Counting SNPs... '
2010-07-16 23:33:47 +08:00
#lines = 0
2010-07-01 00:33:32 +08:00
for header , vcf , counter in vcfReader . lines2VCF ( remainingLines , extendedOutput = True , decodeAll = False ) :
2010-07-16 23:33:47 +08:00
#lines += 1
#print 'lines', lines
2010-07-01 00:33:32 +08:00
if ( counter > OPTIONS . maxRecords and OPTIONS . maxRecords != - 1 ) :
break
2010-07-09 08:13:35 +08:00
2010-07-16 23:33:47 +08:00
if vcf . passesFilters ( ) and vcf . getChrom ( ) not in [ ' MT ' , ' Y ' ] :
2010-07-01 00:33:32 +08:00
if ( vcf . isVariant ( ) ) :
total + = 1
2010-07-09 08:13:35 +08:00
if ( vcf . isNovel ( ) ) : novel + = 1
2010-07-01 00:33:32 +08:00
if ( OPTIONS . verbose and total % 10000 == 0 ) :
print ' progress ' , vcf . getChrom ( ) , vcf . getPos ( )
genotypes = vcf . rest [ 1 : ]
for sampleID , genotypeField in itertools . izip ( sampleIDs , genotypes ) :
#print sampleID, samples
if sampleID in samples :
genotype = genotypeField . split ( ' : ' ) [ 0 ]
variant = genotype != " 0/0 " and genotype != " 0|0 " and genotype != " 0 \0 " and genotype != " ./. "
#print ' => ', vcf, sampleID, genotype, variant
if variant :
if ( useIndels ) :
samples [ sampleID ] . nIndels + = 1
else :
samples [ sampleID ] . nSNPs + = 1
2010-07-16 23:33:47 +08:00
else :
print ' rejecting line ' , vcf
2010-07-01 00:33:32 +08:00
printStatus ( samples )
2010-07-09 08:13:35 +08:00
return total , novel
2010-07-01 00:33:32 +08:00
def countIndels ( samples , indelsVCF ) :
total = 0
if ( indelsVCF != None ) :
return countSNPs ( samples , indelsVCF , True )
2010-07-09 08:13:35 +08:00
return total , 0
2010-07-01 00:33:32 +08:00
def readSamples ( vcf ) :
print ' Reading samples for ' , OPTIONS . population
header , columnNames , remainingLines = vcfReader . readVCFHeader ( open ( vcf ) )
samples = map ( Sample , columnNames [ 9 : ] )
if ( OPTIONS . onlySample != None ) :
samples = filter ( lambda x : x . getName ( ) == OPTIONS . onlySample , samples )
print ' No. samples: ' , len ( samples )
print ' Samples: ' , map ( Sample . getName , samples )
return dict ( map ( lambda x : ( x . getName ( ) , x ) , samples ) )
if __name__ == " __main__ " :
usage = " usage: % prog "
parser = OptionParser ( usage = usage )
parser . add_option ( " -a " , " --alignmentIndex " , dest = " alignmentIndex " , type = ' string ' , default = None , help = " 1KG formated alignment index file " )
parser . add_option ( " -s " , " --sequenceIndex " , dest = " sequenceIndex " , type = ' string ' , default = None , help = " 1KG formated sequence index file " )
parser . add_option ( " " , " --onlySample " , dest = " onlySample " , type = ' string ' , default = None , help = " If provide, only this sample will be processed " )
parser . add_option ( " " , " --snps " , dest = " snps " , type = ' string ' , default = None , help = " SNPs VCF " )
parser . add_option ( " " , " --snpsEval " , dest = " snpsVE " , type = ' string ' , default = None , help = " SNPs VCF VariantEval " )
parser . add_option ( " " , " --indels " , dest = " indels " , type = ' string ' , default = None , help = " Indels VCF " )
parser . add_option ( " " , " --indelsEval " , dest = " indelsVE " , type = ' string ' , default = None , help = " Indels VCF VariantEval " )
parser . add_option ( " " , " --totalGenome " , dest = " totalGenome " , type = ' float ' , default = 2.96e9 , help = " Size, in bp, of the callable genome " )
parser . add_option ( " " , " --calledGenome " , dest = " calledGenome " , type = ' float ' , default = None , help = " Size, in bp, of the callable genome " )
parser . add_option ( " -p " , " --pop " , dest = " population " , type = ' string ' , default = " Anonymous " , help = " Population " )
parser . add_option ( " " , " --project " , dest = " project " , type = ' string ' , default = None , help = " If provided, will only include fastq/BAM files that match this project in the stats calculations " )
parser . add_option ( " -r " , " --root " , dest = " root " , type = ' string ' , default = " . " , help = " Path to the 1KG data " )
parser . add_option ( " -M " , " --maxRecords " , dest = " maxRecords " , type = ' int ' , default = - 1 , help = " If provided, will only include fastq/BAM files that match this regex in the stats calculations " )
parser . add_option ( " -v " , " --verbose " , dest = " verbose " , action = ' store_true ' , default = False , help = " If provided, will be verbose during output " )
parser . add_option ( " " , " --rawBasesFromBas " , dest = " rawBasesFromBas " , action = ' store_true ' , default = False , help = " If provided, we ' ll take our raw base counts from the BAS file " )
parser . add_option ( " -o " , " --output " , dest = " output " , type = ' string ' , default = None , help = " Path to the 1KG data " )
parser . add_option ( " -c " , " --coverageFile " , dest = " coverageFile " , type = ' string ' , default = None , help = " Path to GATK DoC .sample_summary file " )
( OPTIONS , args ) = parser . parse_args ( )
if len ( args ) != 0 :
parser . error ( " incorrect number of arguments " )
samples = readSamples ( OPTIONS . snps )
nSamples = len ( samples )
ignore , totalMappedBases = countMappedBases ( samples , OPTIONS . alignmentIndex )
totalBases = countBases ( samples , OPTIONS . sequenceIndex )
2010-07-16 23:33:47 +08:00
meanMappedDepth = totalMappedBases / OPTIONS . totalGenome / ( max ( nSamples , 1 ) )
2010-07-09 08:13:35 +08:00
totalSNPs , novelSNPs = countSNPs ( samples , OPTIONS . snps )
totalIndels , novelIndels = countIndels ( samples , OPTIONS . indels )
2010-07-01 00:33:32 +08:00
snpNoveltyRate = 100 - getDBSNPRate ( OPTIONS . snpsVE )
2010-07-16 23:33:47 +08:00
indelNoveltyRate = novelIndels / ( 1.0 * max ( totalIndels , 1 ) ) # 100 - getDBSNPRate(OPTIONS.indelsVE)
2010-07-01 00:33:32 +08:00
out = sys . stdout
if ( OPTIONS . output != None ) : out = open ( OPTIONS . output , ' w ' )
print >> out , ' number of samples ' , nSamples
print >> out , ' total raw bases ' , totalBases
print >> out , ' total mapped bases ' , totalMappedBases
# mean mapped depth is total bases mapped divided by acgt reference base count divided by number of individuals, after rmdup: for exons this is calculated on the target region only
print >> out , ' mean mapped depth ' , meanMappedDepth
print >> out , ' bases called (fraction ref genome) %f ( %.2f %% ) ' % ( OPTIONS . calledGenome , 100.0 * OPTIONS . calledGenome / OPTIONS . totalGenome )
print >> out , ' number of SNP sites ( %% novel) %d ( %.2f %% ) ' % ( totalSNPs , snpNoveltyRate )
2010-07-09 08:13:35 +08:00
print >> out , ' average # SNP sites per individual %.0f ' % average ( map ( Sample . getNSNPs , samples . itervalues ( ) ) )
2010-07-01 00:33:32 +08:00
print >> out , ' number of indel sites ( %% novel) %d ( %.2f %% ) ' % ( totalIndels , indelNoveltyRate )
2010-07-09 08:13:35 +08:00
print >> out , ' average # indel sites per individual %.0f ' % average ( map ( Sample . getNIndels , samples . itervalues ( ) ) )
2010-07-01 00:33:32 +08:00
out . close ( )