Very early draft of script integrating the covariant counting / logistic regression. Deleted some unused code and spurious debug info.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@902 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
109bef6c08
commit
9689bb3331
|
|
@ -91,7 +91,6 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
|
|
||||||
|
|
||||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||||
System.out.println(String.format("Reading: %s (%s:%d-%d)",read.getReadName(),read.getReferenceName(),read.getAlignmentStart(),read.getAlignmentEnd()));
|
|
||||||
SAMRecord recalRead = read;
|
SAMRecord recalRead = read;
|
||||||
byte[] bases = read.getReadBases();
|
byte[] bases = read.getReadBases();
|
||||||
byte[] quals = read.getBaseQualities();
|
byte[] quals = read.getBaseQualities();
|
||||||
|
|
|
||||||
|
|
@ -56,13 +56,6 @@ def process_file(source_file,read_group,dinuc,target_file):
|
||||||
target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
|
target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
|
||||||
os.remove('.'.join((base,dinuc,'csv')))
|
os.remove('.'.join((base,dinuc,'csv')))
|
||||||
os.remove('.'.join((base,dinuc,'parameters')))
|
os.remove('.'.join((base,dinuc,'parameters')))
|
||||||
|
|
||||||
class LogisticRegressionRunner:
|
|
||||||
def __init__(self,source_filename,read_group,dinuc):
|
|
||||||
__base = source_file.name[:source_file.name.rfind('.csv')]
|
|
||||||
__read_group = read_group
|
|
||||||
__dinuc = dinuc
|
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
|
exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Executable files. Please update these to match the installed locations of your tools.
|
||||||
|
samtools_exe='/seq/dirseq/samtools/current/samtools'
|
||||||
|
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
|
||||||
|
R_exe='/util/bin/R'
|
||||||
|
GATK_jar='/humgen/gsa-scr1/hanna/src/StingWorking/dist/GenomeAnalysisTK.jar'
|
||||||
|
|
||||||
|
# Location of the resource files distributed with the recalibration tool.
|
||||||
|
resources='resources/'
|
||||||
|
|
||||||
|
# Where does the reference live?
|
||||||
|
reference_base = resources + 'Homo_sapiens_assembly18'
|
||||||
|
reference = reference_base + '.fasta'
|
||||||
|
reference_dict = reference_base + '.dict'
|
||||||
|
reference_fai = reference_base + '.fasta.fai'
|
||||||
|
|
||||||
|
# Where does DBSNP live?
|
||||||
|
dbsnp = resources + 'dbsnp.rod.out'
|
||||||
|
|
||||||
|
import sys,os
|
||||||
|
|
||||||
|
def exit(msg,errorcode):
|
||||||
|
print msg
|
||||||
|
sys.exit(errorcode)
|
||||||
|
|
||||||
|
def check_input_file_available(filename,description):
|
||||||
|
if not os.access(filename,os.R_OK):
|
||||||
|
exit('Unable to access %s %s' % (description,filename),1)
|
||||||
|
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
exit('Usage: python RecalQual.py <input bam file> <calibrated output bam file>',1)
|
||||||
|
|
||||||
|
# check that the input bam file exists, and that the bam is indexed.
|
||||||
|
bam = sys.argv[1]
|
||||||
|
bam_index = bam + '.bai'
|
||||||
|
|
||||||
|
check_input_file_available(bam,'reads file')
|
||||||
|
check_input_file_available(bam_index,'reads index file')
|
||||||
|
|
||||||
|
# parse the user's calibration output file requirements
|
||||||
|
calibrated_bam = sys.argv[2]
|
||||||
|
calibrated_bam_index = calibrated_bam + '.bai'
|
||||||
|
|
||||||
|
# check that the fasta and supporting files are available
|
||||||
|
check_input_file_available(reference,'reference file')
|
||||||
|
check_input_file_available(reference_dict,'reference dictionary')
|
||||||
|
check_input_file_available(reference_fai,'reference index file')
|
||||||
|
|
||||||
|
# check that the dbsnp is available
|
||||||
|
check_input_file_available(dbsnp,'dbsnp file')
|
||||||
|
|
||||||
|
# make an output directory for temporary files
|
||||||
|
if not os.path.isdir('output'):
|
||||||
|
os.mkdir('output')
|
||||||
|
|
||||||
|
# assemble the required program arguments
|
||||||
|
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',GATK_jar,'-R',reference,'--DBSNP',dbsnp,'-l INFO','-L chrM'))
|
||||||
|
|
||||||
|
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/output','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||||
|
compute_logistic_regression = ' '.join(('python','LogisticRegressionByReadGroup.py','output/output.covariate_counts.csv','output/linear_regression_results.out'))
|
||||||
|
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
|
||||||
|
index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
|
||||||
|
|
||||||
|
os.system(generate_covariates)
|
||||||
|
os.system(compute_logistic_regression)
|
||||||
|
os.system(apply_logistic_regression)
|
||||||
|
os.system(index_calibrated_bamfile)
|
||||||
Loading…
Reference in New Issue