From cc1f94310d3761613083bd650a129f874712e701 Mon Sep 17 00:00:00 2001 From: droazen Date: Wed, 22 Jun 2011 22:53:45 +0000 Subject: [PATCH] A prototype script and library dependencies to extract a BAM list from a reasonably well-formed PM's xls{x}-format spreadsheet or tsv file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6036 348d0f76-0448-11de-a6fe-93d51630548a --- ivy.xml | 4 ++ python/parse_pm_input.py | 81 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 python/parse_pm_input.py diff --git a/ivy.xml b/ivy.xml index 6ece07367..c2a6c4ccd 100644 --- a/ivy.xml +++ b/ivy.xml @@ -60,6 +60,10 @@ + + + + diff --git a/python/parse_pm_input.py b/python/parse_pm_input.py new file mode 100644 index 000000000..9857c74df --- /dev/null +++ b/python/parse_pm_input.py @@ -0,0 +1,81 @@ +# +# Generates BAM lists from Excel and TSV files provided by project managers. Suitable for input into the pre-QC metrics generation +# script. +# +# To run: +# /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \ +# -J-classpath $STING_HOME/lib/poi-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-schemas-3.8-beta3.jar:$STING_HOME/lib/xmlbeans-2.3.0.jar:$STING_HOME/lib/dom4j-1.6.1.jar +# parse_pm_input.py > +# +from java.io import FileInputStream +from org.apache.poi.ss.usermodel import Row,Sheet,Workbook,WorkbookFactory + +import os,sys + +base_path = '/seq/picard_aggregation/%s/%s' + +def excel_reader(filename): + wb = WorkbookFactory.create(FileInputStream(filename)); + for sheet_number in range(wb.getNumberOfSheets()): + project_column = None + sample_column = None + + sheet = wb.getSheetAt(sheet_number); + + for cell in sheet.getRow(0): + column_index = cell.getColumnIndex() + column_contents = cell.getStringCellValue() + if column_contents == 'Project': + project_column = column_index + if column_contents == 'External ID' or column_contents == 'Individual ID': + sample_column = column_index + + if project_column != None and sample_column != None: + for row_number in range(1,sheet.getLastRowNum()+1): + project = sheet.getRow(row_number).getCell(project_column).getStringCellValue() + sample = sheet.getRow(row_number).getCell(sample_column).getStringCellValue() + yield project,sample + return + +def tsv_reader(filename): + f = open(filename,'rU') + for line in f: + tokens =line.split('\t') + project = tokens[0].strip() + sample = tokens[1].strip() + yield project,sample + f.close() + +def create_reader(filename): + extension = os.path.splitext(filename)[1] + if extension == '.xls' or extension == '.xlsx': + return excel_reader(filename) + elif extensions == '.tsv' or extension == '.txt': + return tsv_reader(filename) + else: + print 'Unrecognized file extension',extension + sys.exit(1) + +if len(sys.argv) != 2: + print 'USAGE: %s ' + sys.exit(1) +if not os.path.exists(sys.argv[1]): + print 'Input file %s not found' % sys.argv[1] + sys.exit(1) + +input_filename = sys.argv[1] + +for project,sample in create_reader(input_filename): + sample_path = base_path % (project,sample) + versions = [] + for version_path in os.listdir(sample_path): + if version_path[0] != 'v': + print 'Hit a path name that cannot be parsed: ',version_path + sys.exit(1) + versions.append(int(version_path[1:])) + versions = sorted(versions) + bam_file = '%s/v%d/%s.bam' % (sample_path,versions[-1],sample) + if not os.path.exists(bam_file): + print 'Malformed file: tried to find %s, but no such path exists' % bam_file + sys.exit(1) + print bam_file