Cleanup.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@931 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
98396732ba
commit
596773e6c6
|
|
@ -95,6 +95,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
if( readGroup.getAttribute("PL") == null )
|
if( readGroup.getAttribute("PL") == null )
|
||||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
|
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
|
||||||
|
if( !isIlluminaReadGroup(readGroup) )
|
||||||
|
continue;
|
||||||
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
|
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
|
||||||
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
|
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
|
||||||
for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) {
|
for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) {
|
||||||
|
|
@ -117,7 +119,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
||||||
if ( (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString())) &&
|
if ( isIlluminaReadGroup(readGroup) &&
|
||||||
!read.getReadNegativeStrandFlag() &&
|
!read.getReadNegativeStrandFlag() &&
|
||||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
||||||
|
|
@ -384,4 +386,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
public CovariateCounterWalker() throws FileNotFoundException {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) {
|
||||||
|
return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,13 @@ samtools_exe='/seq/dirseq/samtools/current/samtools'
|
||||||
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
|
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
|
||||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||||
|
|
||||||
|
# Any special site-specific arguments to pass the JVM.
|
||||||
|
jvm_args='-ea -Xmx4096m'
|
||||||
|
|
||||||
|
# Where to put the output created as part of recalibration.
|
||||||
|
# If editing, please end this variable with a trailing slash.
|
||||||
|
output_root = './'
|
||||||
|
|
||||||
# Location of the resource files distributed with the recalibration tool.
|
# Location of the resource files distributed with the recalibration tool.
|
||||||
# If editing, please end this variable with a trailing slash.
|
# If editing, please end this variable with a trailing slash.
|
||||||
resources='resources/'
|
resources='resources/'
|
||||||
|
|
@ -18,12 +25,12 @@ reference_fai = reference_base + '.fasta.fai'
|
||||||
# Where does DBSNP live?
|
# Where does DBSNP live?
|
||||||
dbsnp = resources + 'dbsnp.rod.out'
|
dbsnp = resources + 'dbsnp.rod.out'
|
||||||
|
|
||||||
# Where are the application files required to run the recalibration
|
# Where are the application files required to run the recalibration?
|
||||||
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||||
logistic_regression_script = resources + 'logistic_regression.R'
|
logistic_regression_script = resources + 'logistic_regression.R'
|
||||||
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
||||||
|
|
||||||
import glob,os,sys
|
import getopt,glob,os,sys
|
||||||
import LogisticRegressionByReadGroup
|
import LogisticRegressionByReadGroup
|
||||||
|
|
||||||
def exit(msg,errorcode):
|
def exit(msg,errorcode):
|
||||||
|
|
@ -46,18 +53,18 @@ def recalibrate():
|
||||||
'Recalibrate the given bam file'
|
'Recalibrate the given bam file'
|
||||||
# generate the covariates
|
# generate the covariates
|
||||||
print 'generating covariates'
|
print 'generating covariates'
|
||||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT output/initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||||
returncode = os.system(generate_covariates)
|
returncode = os.system(generate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to generate covariates',1)
|
exit('Unable to generate covariates',1)
|
||||||
|
|
||||||
# compute the logistic regression
|
# compute the logistic regression
|
||||||
print 'computing the logistic regression'
|
print 'computing the logistic regression'
|
||||||
LogisticRegressionByReadGroup.compute_logistic_regression('output/initial.covariate_counts.csv','output/linear_regression_results.out',R_exe,logistic_regression_script)
|
LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',output_dir + 'linear_regression_results.out',R_exe,logistic_regression_script)
|
||||||
|
|
||||||
# apply the logistic regression, writing the output data to calibrated_bam
|
# apply the logistic regression, writing the output data to calibrated_bam
|
||||||
print 'applying the correction to the reads'
|
print 'applying the correction to the reads'
|
||||||
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams output/linear_regression_results.out','-outputBAM',calibrated_bam))
|
apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
|
||||||
returncode = os.system(apply_logistic_regression)
|
returncode = os.system(apply_logistic_regression)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to apply logistic regression',1)
|
exit('Unable to apply logistic regression',1)
|
||||||
|
|
@ -75,38 +82,55 @@ def evaluate():
|
||||||
'Evaluate recalibration results.'
|
'Evaluate recalibration results.'
|
||||||
print 'Evaluating recalibration results'
|
print 'Evaluating recalibration results'
|
||||||
# regenerate the covariates
|
# regenerate the covariates
|
||||||
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT output/recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||||
print 'regenerating covariates'
|
print 'regenerating covariates'
|
||||||
returncode = os.system(regenerate_covariates)
|
returncode = os.system(regenerate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to regenerate covariates',1)
|
exit('Unable to regenerate covariates',1)
|
||||||
|
|
||||||
print 'graphing initial results'
|
print 'graphing initial results'
|
||||||
for filename in glob.glob('output/initial.*.empirical_v_reported_quality.csv'):
|
for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
|
||||||
graph_file(empirical_vs_reported_grapher,filename)
|
graph_file(empirical_vs_reported_grapher,filename)
|
||||||
print 'graphing final results'
|
print 'graphing final results'
|
||||||
for filename in glob.glob('output/recalibrated.*.empirical_v_reported_quality.csv'):
|
for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
|
||||||
graph_file(empirical_vs_reported_grapher,filename)
|
graph_file(empirical_vs_reported_grapher,filename)
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
def usage():
|
||||||
exit('Usage: python RecalQual.py <input bam file> <calibrated output bam file> [{RECALIBRATE | EVALUATE | RECALIBRATE_AND_EVALUATE}]',1)
|
exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
|
||||||
|
|
||||||
operation = 'RECALIBRATE'
|
# Try to parse the given command-line arguments.
|
||||||
if len(sys.argv) == 4:
|
try:
|
||||||
operation = sys.argv[3]
|
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate'])
|
||||||
|
except getopt.GetoptError, err:
|
||||||
|
usage()
|
||||||
|
|
||||||
if operation not in ['RECALIBRATE','EVALUATE','RECALIBRATE_AND_EVALUATE']:
|
# An input and an output file is required. Fail if left unspecified.
|
||||||
exit('Operation %s not recognized. Operation must be RECALIBRATE, EVALUATE, or RECALIBRATE_AND_EVALUATE' % operation,1)
|
if len(args) < 2:
|
||||||
|
usage()
|
||||||
|
|
||||||
|
# Determine whether to evaluate / recalibrate.
|
||||||
|
recalibrate_requested = False
|
||||||
|
evaluate_requested = False
|
||||||
|
for opt,arg in opts:
|
||||||
|
if opt == '--recalibrate':
|
||||||
|
recalibrate_requested = True
|
||||||
|
if opt == '--evaluate':
|
||||||
|
evaluate_requested = True
|
||||||
|
|
||||||
|
# Default to 'recalibrate' unless the user specified only evaluate.
|
||||||
|
do_recalibration = not (evaluate_requested and not recalibrate_requested)
|
||||||
|
# Only evaluate if the user specifically requested evaluation.
|
||||||
|
do_evaluation = evaluate_requested
|
||||||
|
|
||||||
# check that the input bam file exists, and that the bam is indexed.
|
# check that the input bam file exists, and that the bam is indexed.
|
||||||
bam = sys.argv[1]
|
bam = args[0]
|
||||||
bam_index = bam + '.bai'
|
bam_index = bam + '.bai'
|
||||||
|
|
||||||
check_input_file_available(bam,'reads file')
|
check_input_file_available(bam,'reads file')
|
||||||
check_input_file_available(bam_index,'reads index file')
|
check_input_file_available(bam_index,'reads index file')
|
||||||
|
|
||||||
# parse the user's calibration output file requirements
|
# parse the user's calibration output file requirements
|
||||||
calibrated_bam = sys.argv[2]
|
calibrated_bam = args[1]
|
||||||
calibrated_bam_index = calibrated_bam + '.bai'
|
calibrated_bam_index = calibrated_bam + '.bai'
|
||||||
|
|
||||||
# check that the fasta and supporting files are available
|
# check that the fasta and supporting files are available
|
||||||
|
|
@ -125,14 +149,17 @@ check_input_file_available(gatk,'Genome Analysis Toolkit')
|
||||||
check_input_file_available(logistic_regression_script,'logistic regression script')
|
check_input_file_available(logistic_regression_script,'logistic regression script')
|
||||||
|
|
||||||
# make an output directory for temporary files
|
# make an output directory for temporary files
|
||||||
if not os.path.isdir('output'):
|
output_dir=output_root+'output.' + bam[bam.rfind('/')+1:bam.rfind('.bam')] + '/'
|
||||||
os.mkdir('output')
|
if not os.path.isdir(output_dir):
|
||||||
|
os.mkdir(output_dir)
|
||||||
|
if not os.path.isdir(output_dir):
|
||||||
|
exit('Unable to create output directory ' + output_dir,1)
|
||||||
|
|
||||||
# assemble the required program arguments
|
# assemble the required program arguments
|
||||||
gatk_base_cmdline = ' '.join((java_exe,'-ea','-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))
|
gatk_base_cmdline = ' '.join((java_exe,jvm_args,'-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))
|
||||||
|
|
||||||
if operation.find('RECALIBRATE') != -1:
|
if do_recalibration:
|
||||||
recalibrate()
|
recalibrate()
|
||||||
|
|
||||||
if operation.find('EVALUATE') != -1:
|
if do_evaluation:
|
||||||
evaluate()
|
evaluate()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue