gatk-3.8/python/LogisticRegressionByReadGro...

#!/usr/bin/env python

import os,sys

def exit(msg,errorlevel=1):
    """Print a message and terminate the program.

    msg        -- text printed to standard output before exiting.
    errorlevel -- process exit status handed to sys.exit().  Defaults to 1
                  because some call sites in this file pass only a message;
                  without a default those calls fail with a TypeError
                  instead of exiting cleanly.

    Never returns: sys.exit() raises SystemExit.

    Note that inside this module the name shadows the interpreter's
    built-in exit().
    """
    # The parenthesised form prints the same text under Python 2 and is
    # also valid syntax under Python 3.
    print(msg)
    sys.exit(errorlevel)

def open_source_file(source_filename):
    """Validate and open the covariate counts file for reading.

    source_filename -- path of the input file.  It must be readable by the
                       current process and its name must end in '.csv'.

    Returns the file object opened in text read mode.  If either check
    fails the program is terminated through exit() with error level 1.
    """
    if not os.access(source_filename,os.R_OK):
        # Report the path that was actually tested.  sys.argv[1] is only the
        # same thing when this function is reached from the command line.
        exit("Unable to read covariate counts file '" + source_filename + "'",1)
    if not source_filename.endswith('.csv'):
        # The error level is passed explicitly, matching the call above.
        exit("Source file is in incorrect format.  Must be csv.",1)
    return open(source_filename,'r')

def read_header(source_file):
    """Read the first line of the source file and validate it as a header.

    source_file -- open file object positioned at the header line.

    Returns the header split on commas.  The line is split exactly as read,
    so the last field still carries the line terminator; code that writes
    the header back out should take that into account.

    Terminates the program through exit() with error level 1 when the
    header does not start with the columns 'rg' and 'dn'.
    """
    header = source_file.readline().split(',')
    # Check the length first so that an empty or single-column header is
    # reported as a format error instead of failing with an IndexError.
    if len(header) < 2 or header[0] != 'rg' or header[1] != 'dn':
        exit("Input file is in invalid format.  First two columns should be <read group>,<dinucleotide>",1)
    return header

def create_intermediate_file(source_file,header,read_group,dinuc):
    """Create the per read group / dinuc intermediate file and write its header.

    source_file -- open source file; only its name is used, to derive the
                   name of the intermediate file.
    header      -- header fields of the source file.  The first two
                   (read group and dinuc) are dropped from the output.
    read_group  -- read group this intermediate file is for.
    dinuc       -- dinucleotide this intermediate file is for.

    The file is named <source name without '.csv'>.<read_group>.<dinuc>.csv
    and is opened for writing, replacing any existing file of that name.

    Returns the open file, positioned after the header row.  The caller is
    responsible for closing it.
    """
    base = source_file.name[:source_file.name.rfind('.csv')]
    intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc)
    intermediate_file = open(intermediate_filename,"w")
    # The last header field may or may not still carry the line terminator
    # of the line it was read from.  Strip whatever is there and add exactly
    # one newline so the first data row always starts on its own line.
    intermediate_file.write(','.join(header[2:]).rstrip('\r\n') + '\n')
    return intermediate_file

def open_target_file(target_filename):
    """Create the target file and emit its tab-separated header row.

    target_filename -- path of the file to create (truncated if it exists).

    The header consists of the labels 'rg' and 'dinuc' followed by the
    labels 'p,q' for every p and q in 0..4, p varying slowest.  Every label,
    including the last one, is followed by a tab, and the row ends with a
    newline.

    Returns the open file, positioned after the header row.
    """
    grid_labels = []
    for row in range(5):
        grid_labels.extend("%d,%d\t" % (row, col) for col in range(5))
    header_row = "rg\tdinuc\t" + ''.join(grid_labels) + "\n"

    out = open(target_filename, 'w')
    out.write(header_row)
    return out

def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):
    """Run the regression on one intermediate file and record the parameters.

    An intermediate file holds the data of a single read group / dinuc pair.

    source_file                -- open source file; only its name is used,
                                  to derive the intermediate file names.
    read_group, dinuc          -- the pair whose intermediate file to process.
    target_file                -- open file that receives one tab-separated
                                  row: read group, dinuc, then the parameters.
    R_exe                      -- path of the executable that runs the script.
    logistic_regression_script -- path of the regression script.

    The command executed is
        R_exe logistic_regression_script <base> <base> <dinuc>
    where <base> is the source name without '.csv' plus '.' + read_group.
    Its output is then read from <base>.<dinuc>.parameters.

    Terminates the program through exit() with error level 1 if the command
    cannot be started, returns a non-zero status, or leaves no readable
    parameters file.
    """
    # Imported here so that this function carries its own dependency.
    import subprocess

    base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
    # Pass the arguments as a list rather than a shell string: read group and
    # dinuc come from the input file and must not be interpreted by a shell,
    # and paths containing spaces stay intact.
    regression_command = [R_exe,logistic_regression_script,base,base,dinuc]
    command_text = ' '.join(regression_command)
    print("Running " + command_text)
    try:
        status = subprocess.call(regression_command)
    except OSError as error:
        exit("Unable to run " + command_text + ": " + str(error),1)
    # A failed run must not be mistaken for success just because a parameters
    # file from an earlier run is still lying around.
    if status != 0:
        exit("Command failed with status %d: %s" % (status,command_text),1)
    parameters_filename = '.'.join((base,dinuc,'parameters'))
    if not os.access(parameters_filename,os.R_OK):
        exit("Unable to read output of R from file " + parameters_filename,1)
    # Flatten all lines of the parameters file into one space-separated list.
    with open(parameters_filename,'r') as parameters_file:
        parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
    target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
    # optionally remove the input and output files.  For now, keep them around for debugging purposes.
    # os.remove('.'.join((base,dinuc,'csv')))
    # os.remove('.'.join((base,dinuc,'parameters')))
    
def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
    """Split the covariate data per read group / dinuc and regress each part.

    source_filename            -- covariate counts csv; the first two columns
                                  are read group and dinucleotide.
    target_filename            -- file that receives one row of regression
                                  parameters per read group / dinuc pair.
    R_exe                      -- path of the executable that runs the script.
    logistic_regression_script -- path of the regression script.

    Rows are grouped by consecutive runs of the same (read group, dinuc)
    pair: each run is copied, minus its first two columns, into its own
    intermediate file, which is then handed to process_file().  Rows of one
    pair are therefore expected to be contiguous in the source file; a pair
    that reappears later gets a fresh intermediate file under the same name.

    Blank lines are skipped.  A row with fewer than two columns terminates
    the program through exit() with error level 1.
    """
    source_file = open_source_file(source_filename)
    target_file = None
    intermediate_file = None
    try:
        target_file = open_target_file(target_filename)

        header = read_header(source_file)

        read_group = None
        dinuc = None

        for data_line in source_file:
            data_line = data_line.strip()
            if len(data_line) == 0:
                continue
            data = data_line.split(',')
            if len(data) < 2:
                # Without both key columns the row cannot be assigned to a group.
                exit("Input file is in invalid format.  Expected <read group>,<dinucleotide>,... but found '" + data_line + "'",1)
            if read_group != data[0] or dinuc != data[1]:
                # A new group starts: finish the previous one before opening the next.
                if intermediate_file is not None:
                    intermediate_file.close()
                    process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
                read_group,dinuc = data[0:2]
                intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
            intermediate_file.write(','.join(data[2:])+'\n')

        # The last group has no following row to trigger its processing.
        if intermediate_file is not None:
            intermediate_file.close()
            process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
    finally:
        # Release every file even when a step above fails or exits.  Closing
        # an already closed file is harmless.
        if intermediate_file is not None:
            intermediate_file.close()
        if target_file is not None:
            target_file.close()
        source_file.close()

if __name__ == "__main__":
    # Default locations of the R script runner and the regression script.
    R_exe = "/broad/tools/apps/R-2.6.0/bin/Rscript"
    logistic_regression_script = "resources/logistic_regression.R"

    # Both the input csv and the output file must be named on the command line.
    if not len(sys.argv) >= 3:
        exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)

    compute_logistic_regression(
        sys.argv[1],
        sys.argv[2],
        R_exe,
        logistic_regression_script)
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00			`#!/usr/bin/env python`

			`import os,sys`

			`def exit(msg,errorlevel):`
			`"Exit the program with the specified message and error code."`
			`print msg`
			`sys.exit(errorlevel)`

			`def open_source_file(source_filename):`
			`"Open the source file with the given name. Make sure it's readable."`
			`if not os.access(source_filename,os.R_OK):`
			`exit("Unable to read covariate counts file '" + sys.argv[1] + "'",1)`
			`if not source_filename.endswith('.csv'):`
			`exit("Source file is in incorrect format. Must be csv.")`
			`return open(source_filename,'r')`

			`def read_header(source_file):`
			`"Read the header from the given source file. Do basic validation."`
			`header = source_file.readline().split(',')`
			`if header[0] != 'rg' or header[1] != 'dn':`
			`exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>",1)`
			`return header`

Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`def create_intermediate_file(source_file,header,read_group,dinuc):`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00			`"Create an intermediate file for a particular read group / dinuc from a given source file"`
			`base = source_file.name[:source_file.name.rfind('.csv')]`
			`intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc)`
			`intermediate_file = open(intermediate_filename,"w")`
			`intermediate_file.write(','.join(header[2:]))`
			`return intermediate_file`

			`def open_target_file(target_filename):`
			`"Open a target file and write out the header."`
			`target_file = open(target_filename,'w')`
			`target_file.write("rg\tdinuc\t")`
			`for p in range(5):`
			`for q in range(5):`
			`target_file.write("%d,%d\t" % (p,q))`
			`target_file.write("\n")`
			`return target_file`

Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):`
			`"Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc."`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00			`base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group`
			`regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))`
			`print "Running " + regression_command`
			`os.system(regression_command)`
			`parameters_filename = '.'.join((base,dinuc,'parameters'))`
			`if not os.access(parameters_filename,os.R_OK):`
			`exit("Unable to read output of R from file " + parameters_filename)`
			`parameters_file = open(parameters_filename,'r')`
			`parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')`
			`target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')`
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`# optionally remove the input and output files. For now, keep them around for debugging purposes.`
			`# os.remove('.'.join((base,dinuc,'csv')))`
			`# os.remove('.'.join((base,dinuc,'parameters')))`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):`
			`"Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results."`
			`source_file = open_source_file(source_filename)`
			`target_file = open_target_file(target_filename)`

			`header = read_header(source_file)`

			`intermediate_file = None`
			`read_group = None`
			`dinuc = None`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`for data_line in source_file:`
			`data_line = data_line.strip()`
			`if len(data_line) == 0:`
			`continue`
			`data = data_line.split(',')`
			`if read_group != data[0] or dinuc != data[1]:`
			`if intermediate_file:`
			`intermediate_file.close()`
			`process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)`
			`read_group,dinuc = data[0:2]`
			`intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)`
			`intermediate_file.write(','.join(data[2:])+'\n')`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`if intermediate_file:`
			`intermediate_file.close()`
			`process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`source_file.close()`
			`target_file.close()`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`if __name__ == "__main__":`
			`# set up sensible defaults for the locations of R and the logistic regression script`
			`R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"`
			`logistic_regression_script="resources/logistic_regression.R"`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`if len(sys.argv) < 3:`
			`exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)`
The building blocks for segmenting covariate counting data by read group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@899 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 03:55:24 +08:00
Lots of cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@903 348d0f76-0448-11de-a6fe-93d51630548a 2009-06-05 09:26:10 +08:00			`compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)`