gatk-3.8/python/Verify1KGArchiveBAMs.py

216 lines
6.9 KiB
Python

import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import itertools
class Status:
    """Existence and size of a single file, local or on the FTP archive.

    Attributes set by the constructor:
      file   -- the path that was examined
      exists -- whether the file was found
      size   -- size in bytes
      status -- short human-readable summary: "missing", "no-data" or
                "exists: bytes=<size>"
    """
    def __init__(self, file, exists, size):
        self.file = file
        self.exists = exists
        self.size = size
        # The three cases are mutually exclusive, so they must form a single
        # if/elif/else chain.  With a bare `if` for the first test, "missing"
        # was always overwritten by the if/else that followed it.
        if not exists:
            self.status = "missing"
        elif size == 0:
            self.status = "no-data"
        else:
            self.status = "exists: bytes=" + str(self.size)

    def __str__(self):
        """Return the status summary string."""
        return self.status

    def viewSize(self):
        """Return the size as formatted by MergeBAMsUtils.greek."""
        return MergeBAMsUtils.greek(self.size)
class ComparedFiles:
    """Result of comparing one file's local copy against its FTP copy.

    Attributes:
      file      -- path identifying the file
      status    -- comparison verdict string supplied by the caller
      localStat -- status object for the local copy (.size, .exists and
                   .file are read)
      ftpStat   -- status object for the FTP copy (.size is read)
    """
    def __init__(self, file, status, localStat, ftpStat):
        self.file = file
        self.status = status
        self.localStat = localStat
        self.ftpStat = ftpStat

    def size(self):
        """Return the local size if non-zero, else the FTP size, else 0."""
        # `!=` replaces the deprecated `<>` spelling of the same operator.
        for stat in (self.localStat, self.ftpStat):
            if stat.size != 0:
                return stat.size
        return 0

    def modTime(self):
        """Return the local file's modification time, or 0 if it is absent."""
        if not self.localStat.exists:
            return 0
        return os.path.getmtime(self.localStat.file)
def modTimeStr(t):
    """Format a timestamp (seconds since the epoch) as local-time MM/DD/YY."""
    localFields = time.localtime(t)
    return time.strftime("%m/%d/%y", localFields)
from urlparse import urlparse
from ftplib import FTP
# FTP connection shared by the lookup functions.  It is None until the
# __main__ block replaces it with a logged-in ftplib.FTP instance.
FTPSERVER = None
# When True, ftpStatus and validateFile print diagnostic trace lines.
DEBUG = False
# Cache of FTP `LIST` output: maps a directory path to the list of listing
# lines retrieved for it, so each directory is listed at most once.
CACHED_LIST = dict()

def getSizeForFile(dir, filename):
    """Return the size of `filename` in FTP directory `dir`, or 0 if not listed.

    The directory listing is fetched from FTPSERVER with `LIST` on first use
    and served from CACHED_LIST afterwards.  A listing line matches when it
    has at least nine whitespace-separated fields and the ninth equals
    `filename`; the fifth field is then parsed as the size.  If several lines
    match, the last one wins.  Because lines are split on whitespace, a name
    containing spaces never matches.

    Errors raised by the FTP connection, and ValueError from a non-numeric
    size field, propagate to the caller.
    """
    if dir not in CACHED_LIST:
        FTPSERVER.cwd(dir)
        lines = []
        FTPSERVER.retrlines('LIST', lines.append)
        # Publish to the cache only after the whole listing has arrived, so
        # a failure part-way through cannot leave a truncated listing behind
        # that later lookups would trust.
        CACHED_LIST[dir] = lines

    # An explicit loop instead of map() with a side-effecting callback:
    # map() is lazy in Python 3 and would never run the callback.
    size = 0
    for line in CACHED_LIST[dir]:
        fields = line.split()
        if len(fields) >= 9 and fields[8] == filename:
            size = int(fields[4])
    return size
def ftpStatus( ftpPath ):
    """Look up `ftpPath` on the FTP server and describe it as a Status.

    The path is split into directory and file name and the size is obtained
    from getSizeForFile.  Any error during the lookup is treated as "not
    found" and yields size 0.  The returned Status reports the file as
    existing only when the size is non-zero.
    """
    if DEBUG: print('ftpPath %s' % ftpPath)
    dirname, filename = os.path.split(ftpPath)
    if DEBUG: print('listing %s' % dirname)
    try:
        size = getSizeForFile(dirname, filename)
    except Exception:
        # Best effort: a failed lookup counts as a missing file.  Catching
        # Exception rather than using a bare `except:` lets Ctrl-C and
        # SystemExit still stop a long scan.
        size = 0
    if DEBUG: print(' result was %s' % size)
    return Status( ftpPath, size != 0, size )
def localStatus(file):
    """Build a Status for a path on the local filesystem.

    The size is read with os.path.getsize only when the path exists; a
    missing path is reported with size 0.
    """
    found = os.path.exists(file)
    nbytes = os.path.getsize(file) if found else 0
    return Status(file, found, int(nbytes))
def validateFile(relPath, localRoot, ftpRoot):
    """Compare one file between the local tree and the FTP archive.

    `relPath` is joined onto `localRoot` and onto `ftpRoot`; the local side
    is probed with localStatus, the FTP side with ftpStatus, and the two
    results are combined by compareFileStatus.  Unless OPTIONS.quiet is set,
    one STATUS line is printed for the file.

    Returns the object produced by compareFileStatus.
    """
    # Join onto the localRoot argument.  The original read the module-level
    # name `root` here, which only worked when the script's own main block
    # had defined a global of that name with the same value.
    localPath = os.path.join(localRoot, relPath)
    ftpPath = os.path.join(ftpRoot, relPath)
    # check the local file
    if DEBUG: print('Checking %s' % relPath)
    localStat = localStatus(localPath)
    ftpStat = ftpStatus(ftpPath)
    if DEBUG: print(' local status is %s' % localStat)
    if DEBUG: print(' ftp status is  %s' % ftpStat)
    compared = compareFileStatus(localStat, ftpStat)
    if not OPTIONS.quiet:
        print('STATUS %20s for %s ' % (compared.status, relPath))
    return compared
import MergeBAMsUtils
import time
def compareFileStatus(localStat, ftpStat):
    """Classify a file from its local and FTP status and wrap the result.

    Verdicts:
      both exist, equal sizes      -> 'in-sync'
      both exist, different sizes  -> 'size-mismatch'
      only the local copy exists   -> 'unknown-local-file'
      only the FTP copy exists     -> 'local-file-missing'
      neither exists               -> 'orphaned-file'

    Returns a ComparedFiles carrying the verdict and both status objects.
    """
    if localStat.exists and ftpStat.exists:
        if localStat.size == ftpStat.size:
            verdict = 'in-sync'
        else:
            verdict = 'size-mismatch'
    elif localStat.exists:
        verdict = 'unknown-local-file'
    elif ftpStat.exists:
        verdict = 'local-file-missing'
    else:
        verdict = 'orphaned-file'
    return ComparedFiles(localStat.file, verdict, localStat, ftpStat)
import re
def filesInLocalPath(root, subdir):
    """Collect the .bam/.bai files below root/subdir as paths relative to root.

    Walks os.path.join(root, subdir) recursively and returns a set of paths,
    each expressed relative to `root`.  Returns an empty set when `subdir`
    is None.  When OPTIONS.maxLocalFiles is not None, collection stops once
    that many files have been gathered.
    """
    regex = re.compile(r".*\.(bam|bai)$")
    localFiles = set()
    if subdir is None:
        return localFiles

    limit = OPTIONS.maxLocalFiles
    for fullroot, dirs, files in os.walk(os.path.join(root, subdir)):
        for name in files:
            if not regex.match(name):
                continue
            # Test the cap before adding so that at most `limit` files are
            # returned; testing `len > limit` after the add let one extra
            # file through.
            if limit is not None and len(localFiles) >= limit:
                return localFiles
            fullpath = os.path.join(fullroot, name)
            # os.path.relpath instead of fullpath.split(root)[1]: the split
            # left a leading separator whenever root had no trailing one
            # (os.path.join then treats the result as absolute and discards
            # the root it is joined onto), and it picked the wrong piece
            # when the root text occurred more than once in the path.
            localFiles.add(os.path.relpath(fullpath, root))
    return localFiles
def readAlignmentIndex(file):
    """Return the set of file paths named in an alignment index file.

    For every non-blank line, the first and fifth whitespace-separated
    fields are added to the result.  Returns an empty set when `file` is
    None.

    Raises IndexError for a non-blank line with fewer than five fields.
    """
    files = set()
    if file is None:
        return files
    # `with` guarantees the handle is closed; the original never closed it.
    with open(file) as index:
        for line in index:
            fields = line.split()
            if not fields:
                # A blank line (e.g. at end of file) carries no entry.
                continue
            files.add(fields[0])
            files.add(fields[4])
    return files
if __name__ == "__main__":
    # Command-line entry point: parse options, log in to the FTP server,
    # compare every file named by the alignment index and/or found in the
    # local scan, then print a per-status summary.
    #
    # This is deliberately flat module-level code rather than a main()
    # function: OPTIONS, FTPSERVER and root are read as module globals by
    # the functions defined above, so they must be assigned at module scope.
    usage = "usage: %prog -l and/or -a root ftpRoot"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", "--local", dest="scanLocal",
                      type='string', default=None,
                      help="If provided, checks all of the local files against the archive")
    parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex",
                      type='string', default=None,
                      help="If provided, checks all of the files in the alignment.index in the archive")
    parser.add_option("-m", "--maxLocal", dest="maxLocalFiles",
                      type='int', default=None,
                      help="If provided, maximum number of files in the local archive to examine")
    # The help text used to claim this flag *prints* the per-file status;
    # the flag actually suppresses those lines.
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="If provided, does not print the individual status of each file")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments")
    root, ftpRoot = args

    # Element 1 of the parsed URL is the host, element 2 the path on it.
    ftpParsed = urlparse(ftpRoot)
    FTPSERVER = FTP(ftpParsed[1])
    FTPSERVER.login()

    # Relative path -> comparison result; a path present in both sources
    # is validated only once.
    results = dict()
    for relPath in itertools.chain(readAlignmentIndex(OPTIONS.alignmentIndex), filesInLocalPath(root, OPTIONS.scanLocal)):
        if relPath not in results:
            results[relPath] = validateFile(relPath, root, ftpParsed[2])

    total = len(results)
    print('SUMMARY: Total files examined %d' % total)
    for status in ['in-sync', 'size-mismatch', 'unknown-local-file', 'local-file-missing', 'orphaned-file']:
        print('-' * 80)
        filesOfStatus = [compared for compared in results.values() if compared.status == status]
        n = len(filesOfStatus)
        # Guard the percentage: with no files examined the division raised
        # ZeroDivisionError.
        if total:
            percent = n * 100.0 / total
        else:
            percent = 0.0
        print('SUMMARY: %s' % ( status ))
        print('SUMMARY: files %d (%.2f%% of total)' % ( n, percent ))
        if n > 0:
            fileSizes = MergeBAMsUtils.greek(sum(compared.size() for compared in filesOfStatus))
            # max() over the sequence itself.  apply(max, seq) expands to
            # max(*seq), which raises TypeError when seq has exactly one
            # element because max() then gets a single non-iterable number.
            mostRecentMod = max(compared.modTime() for compared in filesOfStatus)
            if mostRecentMod > 0:
                modTime = modTimeStr(mostRecentMod)
            else:
                modTime = "N/A"
            print('SUMMARY: total size %s' % ( fileSizes ))
            print('SUMMARY: last modification time %s' % ( modTime ))