Allow script to be easily modified to support different platforms.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@955 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7fa84ea157
commit
e77dfe9983
|
|
@ -66,6 +66,15 @@ To (only) evaluate a given bam file after calibrating:
|
||||||
|
|
||||||
python RecalQual.py --evaluate <source bam> <recalibrated bam>
|
python RecalQual.py --evaluate <source bam> <recalibrated bam>
|
||||||
|
|
||||||
|
Platforms
|
||||||
|
---------
|
||||||
|
By default, the recalibrator processes only read groups
|
||||||
|
originating from Illumina sequencers. To enable calibration
|
||||||
|
for other platforms, edit the 'platforms' array at the
|
||||||
|
top of RecalQual.py. Platforms specified here should
|
||||||
|
case-insensitively match the "PL" attribute of the read
|
||||||
|
group in the BAM file.
|
||||||
|
|
||||||
Output
|
Output
|
||||||
------
|
------
|
||||||
The recalibration process keeps many incremental
|
The recalibration process keeps many incremental
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Collections;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
|
|
@ -44,6 +45,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
||||||
public int MAX_READ_GROUPS = 100;
|
public int MAX_READ_GROUPS = 100;
|
||||||
|
|
||||||
|
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = Illumina)")
|
||||||
|
public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||||
|
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||||
HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
||||||
|
|
@ -101,8 +105,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
if( readGroup.getAttribute("PL") == null )
|
if( readGroup.getAttribute("PL") == null )
|
||||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
|
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
||||||
if( !isIlluminaReadGroup(readGroup) )
|
if( !isSupportedReadGroup(readGroup) )
|
||||||
continue;
|
continue;
|
||||||
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
|
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
|
||||||
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
|
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
|
||||||
|
|
@ -126,7 +130,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
||||||
if ( isIlluminaReadGroup(readGroup) &&
|
if ( isSupportedReadGroup(readGroup) &&
|
||||||
!read.getReadNegativeStrandFlag() &&
|
!read.getReadNegativeStrandFlag() &&
|
||||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
||||||
|
|
@ -394,7 +398,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) {
|
/**
|
||||||
return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()));
|
* Check to see whether this read group should be processed.
|
||||||
|
* @param readGroup the read group whose "PL" attribute is compared against the configured platforms
|
||||||
|
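* @return true if at least one platform is configured and the read group's "PL" attribute is either unset or equal, ignoring case, to one of the trimmed platform names; false otherwise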
|
||||||
|
*/
|
||||||
|
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
|
||||||
|
for( String platform: platforms ) {
|
||||||
|
platform = platform.trim();
|
||||||
|
if( readGroup.getAttribute("PL") == null || readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,9 @@ R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||||
# Any special site-specific arguments to pass the JVM.
|
# Any special site-specific arguments to pass the JVM.
|
||||||
jvm_args='-ea -Xmx4096m'
|
jvm_args='-ea -Xmx4096m'
|
||||||
|
|
||||||
|
# Which platforms should the calibration tool be run over?
|
||||||
|
platforms=['illumina']
|
||||||
|
|
||||||
# Where to put the output created as part of recalibration.
|
# Where to put the output created as part of recalibration.
|
||||||
# If editing, please end this variable with a trailing slash.
|
# If editing, please end this variable with a trailing slash.
|
||||||
output_root = './'
|
output_root = './'
|
||||||
|
|
@ -39,6 +42,9 @@ gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||||
logistic_regression_script = resources + 'logistic_regression.R'
|
logistic_regression_script = resources + 'logistic_regression.R'
|
||||||
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
||||||
|
|
||||||
|
# Assemble the platform list into command-line arguments.
|
||||||
|
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
|
||||||
|
|
||||||
def exit(msg,errorcode):
|
def exit(msg,errorcode):
|
||||||
print msg
|
print msg
|
||||||
sys.exit(errorcode)
|
sys.exit(errorcode)
|
||||||
|
|
@ -59,7 +65,7 @@ def recalibrate():
|
||||||
'Recalibrate the given bam file'
|
'Recalibrate the given bam file'
|
||||||
# generate the covariates
|
# generate the covariates
|
||||||
print 'generating covariates'
|
print 'generating covariates'
|
||||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||||
returncode = os.system(generate_covariates)
|
returncode = os.system(generate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
exit('Unable to generate covariates',1)
|
exit('Unable to generate covariates',1)
|
||||||
|
|
@ -88,7 +94,7 @@ def evaluate():
|
||||||
'Evaluate recalibration results.'
|
'Evaluate recalibration results.'
|
||||||
print 'Evaluating recalibration results'
|
print 'Evaluating recalibration results'
|
||||||
# regenerate the covariates
|
# regenerate the covariates
|
||||||
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||||
print 'regenerating covariates'
|
print 'regenerating covariates'
|
||||||
returncode = os.system(regenerate_covariates)
|
returncode = os.system(regenerate_covariates)
|
||||||
if returncode != 0:
|
if returncode != 0:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue