From c988205884836ecd3a9dfeb7eae774733bcbf76f Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 10 Sep 2009 03:18:51 +0000 Subject: [PATCH] Notes for Aaron in SSG git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1576 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/SingleSampleGenotyper.java | 3 +- python/Verify1KGArchiveBAMs.py | 215 ++++++++++++++++++ 2 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 python/Verify1KGArchiveBAMs.py diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index c78c3ada5..fc4b869ab 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -154,7 +154,8 @@ public class SingleSampleGenotyper extends LocusWalker= LOD_THRESHOLD) { sum.nConfidentCalls++; diff --git a/python/Verify1KGArchiveBAMs.py b/python/Verify1KGArchiveBAMs.py new file mode 100644 index 000000000..519528bc1 --- /dev/null +++ b/python/Verify1KGArchiveBAMs.py @@ -0,0 +1,215 @@ +import farm_commands +import os.path +import sys +from optparse import OptionParser +from datetime import date +import glob +import operator +import itertools + +class Status: + def __init__(self, file, exists, size): + self.file = file + self.exists = exists + self.size = size + + if not exists: self.status = "missing" + if size == 0: self.status = "no-data" + else: self.status = "exists: bytes=" + str(self.size) + + def __str__(self): + return self.status + + def viewSize(self): + return MergeBAMsUtils.greek(self.size) + +class ComparedFiles: + def __init__(self, file, status, localStat, ftpStat): + self.file = file + self.status = status + self.localStat = localStat + self.ftpStat = ftpStat + + def size(self): + if self.localStat.size <> 0: + return self.localStat.size + if self.ftpStat.size <> 0: + return self.ftpStat.size + else: + return 0 + + def modTime(self): + if self.localStat.exists: + return os.path.getmtime(self.localStat.file) + else: + return 0 + +def modTimeStr(t): + return time.strftime("%m/%d/%y", time.localtime(t)) + +from urlparse import urlparse +from ftplib import FTP + +FTPSERVER = None + +DEBUG = False + +# from directories to lists of lines +CACHED_LIST = dict() +def getSizeForFile(dir, filename): + global CACHED_LIST + size = [0] + def captureSize(line, cache = True): + #print line + if cache: CACHED_LIST[dir].append(line) + s = line.split() + if len(s) >= 9 and s[8] == filename: + size[0] = int(s[4]) + #print 'Found size', s, size + + if dir in CACHED_LIST: + #print 'cached is', CACHED_LIST[dir] + map( lambda l: captureSize(l, False), CACHED_LIST[dir] ) + else: + FTPSERVER.cwd(dir) + CACHED_LIST[dir] = list() + result = FTPSERVER.retrlines('LIST', captureSize) + + return size[0] + +def ftpStatus( ftpPath ): + if DEBUG: print 'ftpPath', ftpPath + dir, filename = os.path.split(ftpPath) + if DEBUG: print 'listing', dir + + try: + size = getSizeForFile(dir, filename) + except: + #print 'failing...' + size = 0 +# finally: +# pass + #print 'FTPSERVER', FTPSERVER + #FTPSERVER.quit() + + if DEBUG: print ' result was', size + return Status( ftpPath, size <> 0, size ) + +def localStatus(file): + exists = os.path.exists(file) + size = 0 + if exists: size = os.path.getsize(file) + return Status(file, exists, int(size) ) + +def validateFile(relPath, localRoot, ftpRoot): + localPath = os.path.join(root, relPath) + ftpPath = os.path.join(ftpRoot, relPath) + + # check the local file + if DEBUG: print 'Checking', relPath + localStat = localStatus(localPath) + ftpStat = ftpStatus(ftpPath) + if DEBUG: print ' local status is', localStat + if DEBUG: print ' ftp status is ', ftpStat + compared = compareFileStatus(localStat, ftpStat) + + if not OPTIONS.quiet: + print 'STATUS %20s for %s ' % (compared.status, relPath) + return compared + +import MergeBAMsUtils + +import time +def compareFileStatus(localStat, ftpStat): + if localStat.exists: + if ftpStat.exists: + if localStat.size == ftpStat.size: + status = 'in-sync' + else: + status = 'size-mismatch' + else: + status = 'unknown-local-file' + else: + if ftpStat.exists: + status = 'local-file-missing' + else: + status = 'orphaned-file' + + return ComparedFiles(localStat.file, status, localStat, ftpStat) + + +import re +def filesInLocalPath(root, subdir): + regex = re.compile(".*\.(bam|bai)$") + localFiles = set() + + if subdir <> None: + for fullroot, dirs, files in os.walk(os.path.join(root, subdir)): + for file in filter( regex.match, files ): + fullpath = os.path.join(fullroot, file) + path = fullpath.split(root)[1] + #print 'adding relpath=', path, 'fullpath=', fullpath + localFiles.add(path) + if OPTIONS.maxLocalFiles <> None and len(localFiles) > OPTIONS.maxLocalFiles: return localFiles + return localFiles + +def readAlignmentIndex(file): + files = set() + if file <> None: + for line in open(file): + files.add(line.split()[0]) + files.add(line.split()[4]) + return files + +if __name__ == "__main__": + usage = "usage: %prog -l and/or -a root ftpRoot" + parser = OptionParser(usage=usage) + parser.add_option("-l", "--local", dest="scanLocal", + type='string', default=None, + help="If provided, checks all of the local files against the archive") + parser.add_option("-a", "--alignmentIndex", dest="alignmentIndex", + type='string', default=None, + help="If provided, checks all of the files in the alignment.index in the archive") + parser.add_option("-m", "--maxLocal", dest="maxLocalFiles", + type='int', default=None, + help="If provided, maximum number of files in the local archive to examine") + parser.add_option("-q", "--quiet", dest="quiet", + action='store_true', default=False, + help="If provided, prints out the individual status of all files") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 2: + parser.error("incorrect number of arguments") + root, ftpRoot = args + + ftpParsed = urlparse(ftpRoot) + FTPSERVER = FTP(ftpParsed[1]) + FTPSERVER.login() + + results = dict() + for file in itertools.chain(readAlignmentIndex(OPTIONS.alignmentIndex), filesInLocalPath(root, OPTIONS.scanLocal )): + #print line + #bas = line.split()[6] + if file not in results: + compared = validateFile( file, root, ftpParsed[2] ) + results[file] = compared + #localIndex + + print 'SUMMARY: Total files examined', len(results) + for status in ['in-sync', 'size-mismatch', 'unknown-local-file', 'local-file-missing', 'orphaned-file']: + print ''.join(['-'] * 80) + filesOfStatus = filter(lambda x: x.status == status, results.itervalues()) + n = len(filesOfStatus) + print 'SUMMARY: %s' % ( status ) + print 'SUMMARY: files %d (%.2f%% of total)' % ( n, n * 100.0 / len(results)) + + if n > 0: + fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 )) + mostRecentMod = apply(max, map( ComparedFiles.modTime, filesOfStatus )) + if mostRecentMod > 0: + modTime = modTimeStr(mostRecentMod) + else: + modTime = "N/A" + + print 'SUMMARY: total size %s' % ( fileSizes ) + print 'SUMMARY: last modification time %s' % ( modTime )