#
# Generates BAM lists from Excel and TSV files provided by project managers.
# Suitable for input into the pre-QC metrics generation script.
#
# To run:
#   /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \
#     -J-classpath $STING_HOME/lib/poi-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-3.8-beta3.jar:$STING_HOME/lib/poi-ooxml-schemas-3.8-beta3.jar:$STING_HOME/lib/xmlbeans-2.3.0.jar:$STING_HOME/lib/dom4j-1.6.1.jar \
#     parse_pm_input.py INPUT_FILE > BAM_LIST
#
# The BAM list is written to stdout, one path per line; diagnostics go to
# stderr so that they never end up inside the redirected BAM list.
#
# NOTE: this script runs under Jython 2.5, so it deliberately avoids the
# `with` statement, `except ... as ...` and str.format().
#
import os
import sys

try:
    from java.io import FileInputStream
    from org.apache.poi.ss.usermodel import Row, Sheet, Workbook, WorkbookFactory
except ImportError:
    # Java / Apache POI are not reachable (plain Python, or POI jars missing
    # from the classpath). TSV input still works; Excel input is rejected
    # with an explicit message in excel_reader().
    FileInputStream = None
    WorkbookFactory = None

# Template for a sample's aggregation directory: (project, sample).
base_path = '/seq/picard_aggregation/%s/%s'


def _fail(message):
    """Write `message` to stderr and terminate the script with exit status 1."""
    sys.stderr.write(message + '\n')
    sys.exit(1)


def excel_reader(filename):
    """Yield (project, sample) pairs from an Excel workbook.

    Sheets are scanned in order. The first sheet whose first row contains a
    'Project' column and an 'External ID' or 'Individual ID' column supplies
    the data; every following row of that sheet yields one pair, and later
    sheets are ignored. Sheets without a header row, and undefined (blank)
    data rows, are skipped. Nothing is yielded if no sheet qualifies.
    """
    if WorkbookFactory is None or FileInputStream is None:
        _fail('Cannot read %s: Excel input requires Jython with Apache POI on the classpath' % filename)

    stream = FileInputStream(filename)
    try:
        wb = WorkbookFactory.create(stream)
    finally:
        # The workbook is fully loaded by create(); release the file handle.
        stream.close()

    for sheet_number in range(wb.getNumberOfSheets()):
        sheet = wb.getSheetAt(sheet_number)
        header = sheet.getRow(0)
        if header is None:
            # getRow() returns null for an undefined row, i.e. an empty sheet.
            continue

        project_column = None
        sample_column = None
        for cell in header:
            column_contents = cell.getStringCellValue()
            if column_contents == 'Project':
                project_column = cell.getColumnIndex()
            if column_contents in ('External ID', 'Individual ID'):
                sample_column = cell.getColumnIndex()

        if project_column is None or sample_column is None:
            continue

        for row_number in range(1, sheet.getLastRowNum() + 1):
            row = sheet.getRow(row_number)
            if row is None:
                continue
            project = row.getCell(project_column).getStringCellValue()
            sample = row.getCell(sample_column).getStringCellValue()
            yield project, sample
        # Only the first sheet carrying both columns is used.
        return


def tsv_reader(filename):
    """Yield (project, sample) pairs from a tab-separated text file.

    The first column is the project and the second the sample; both are
    stripped of surrounding whitespace and any further columns are ignored.
    Blank lines are skipped. Lines may end in LF, CR or CRLF.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    # Read everything and close the handle before yielding, so the file is
    # released even when the caller abandons the generator early.
    f = open(filename, 'r')
    try:
        lines = f.read().splitlines()
    finally:
        f.close()

    for index, line in enumerate(lines):
        if not line.strip():
            continue
        tokens = line.split('\t')
        if len(tokens) < 2:
            raise ValueError('%s, line %d: expected at least two tab-separated columns (project, sample)'
                             % (filename, index + 1))
        yield tokens[0].strip(), tokens[1].strip()


def create_reader(filename):
    """Return the (project, sample) generator matching the file extension.

    '.xls' / '.xlsx' select excel_reader and '.tsv' / '.txt' select
    tsv_reader; the comparison ignores case. Any other extension prints an
    error and exits with status 1.
    """
    extension = os.path.splitext(filename)[1]
    normalized = extension.lower()
    if normalized in ('.xls', '.xlsx'):
        return excel_reader(filename)
    if normalized in ('.tsv', '.txt'):
        return tsv_reader(filename)
    _fail('Unrecognized file extension %s' % extension)


def find_latest_bam(project, sample, path_template=base_path):
    """Return the path of the BAM in the highest-numbered version directory.

    The sample directory is `path_template % (project, sample)`; each entry
    in it must be named 'v' followed by an integer. The result is
    '<sample dir>/v<highest>/<sample>.bam'. Exits with status 1 (after
    printing the reason) if the sample directory is missing or empty, if an
    entry name cannot be parsed, or if the BAM file does not exist.
    """
    sample_path = path_template % (project, sample)
    if not os.path.isdir(sample_path):
        _fail('Sample directory %s not found' % sample_path)

    versions = []
    for version_path in os.listdir(sample_path):
        try:
            if version_path[0] != 'v':
                raise ValueError(version_path)
            versions.append(int(version_path[1:]))
        except ValueError:
            # Two spaces after the colon: same text the script has always printed.
            _fail('Hit a path name that cannot be parsed:  %s' % version_path)

    if not versions:
        _fail('No version directories found in %s' % sample_path)

    bam_file = '%s/v%d/%s.bam' % (sample_path, max(versions), sample)
    if not os.path.exists(bam_file):
        _fail('Malformed file: tried to find %s, but no such path exists' % bam_file)
    return bam_file


def main(argv):
    """Print the latest BAM path for every (project, sample) in argv[1]."""
    if len(argv) != 2:
        program = argv[0] if argv else 'parse_pm_input.py'
        _fail('USAGE: %s INPUT_FILE' % program)

    input_filename = argv[1]
    if not os.path.exists(input_filename):
        _fail('Input file %s not found' % input_filename)

    for project, sample in create_reader(input_filename):
        print(find_latest_bam(project, sample))


if __name__ == '__main__':
    main(sys.argv)