Allow script to be easily modified to support different platforms.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@955 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-06-09 16:06:57 +00:00
parent 7fa84ea157
commit e77dfe9983
3 changed files with 37 additions and 7 deletions

View File

@ -66,6 +66,15 @@ To (only) evaluate a given bam file after calibrating:
python RecalQual.py --evaluate <source bam> <recalibrated bam>
Platforms
---------
By default, the recalibrator processes only read groups
originating from Illumina sequencers. To enable calibration
for other platforms, edit the 'platforms' array at the
top of RecalQual.py. Platforms specified here are matched
case-insensitively against the "PL" attribute of the read
group in the BAM file.
Output
------
The recalibration process keeps many incremental

View File

@ -15,6 +15,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.HashMap;
import java.util.Collections;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@ -44,6 +45,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
public int MAX_READ_GROUPS = 100;
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = Illumina)")
public List<String> platforms = Collections.singletonList("ILLUMINA");
int NDINUCS = 16;
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
@ -101,8 +105,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
if( readGroup.getAttribute("PL") == null )
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
if( !isIlluminaReadGroup(readGroup) )
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
if( !isSupportedReadGroup(readGroup) )
continue;
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
@ -126,7 +130,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (int i =0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
if ( isIlluminaReadGroup(readGroup) &&
if ( isSupportedReadGroup(readGroup) &&
!read.getReadNegativeStrandFlag() &&
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
@ -394,7 +398,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
random_genrator = new Random(123454321); // keep same random seed while debugging
}
private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) {
return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()));
/**
 * Check to see whether this read group should be processed.
 *
 * A read group is supported when its "PL" attribute is unset (such groups are
 * assumed to be supported, matching the warning issued during initialization),
 * or when that attribute equals, ignoring case, one of the entries in
 * {@code platforms} after surrounding whitespace is trimmed from the entry.
 *
 * @param readGroup the read group to test; callers look it up from a read's "RG"
 *                  attribute, so it is treated as possibly null.
 * @return true if reads from this read group should be processed; false if the
 *         read group is null or its platform is not in {@code platforms}.
 */
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
    // No read group means no platform to check; skip the read rather than
    // failing with a NullPointerException on the attribute lookup below.
    if( readGroup == null )
        return false;

    // Look the attribute up once instead of on every loop iteration.
    Object platformAttribute = readGroup.getAttribute("PL");

    // An unset platform is assumed to be supported, independent of how many
    // platforms were configured.
    if( platformAttribute == null )
        return true;

    String readGroupPlatform = platformAttribute.toString();
    for( String platform: platforms ) {
        if( readGroupPlatform.equalsIgnoreCase(platform.trim()) )
            return true;
    }
    return false;
}
}

View File

@ -8,6 +8,9 @@ R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
# Any special site-specific arguments to pass the JVM.
jvm_args='-ea -Xmx4096m'
# Which platforms should the calibration tool be run over?
platforms=['illumina']
# Where to put the output created as part of recalibration.
# If editing, please end this variable with a trailing slash.
output_root = './'
@ -39,6 +42,9 @@ gatk = resources + 'gatk/GenomeAnalysisTK.jar'
logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
# Assemble the platform list into command-line arguments.
# Each entry in 'platforms' becomes its own '-pl <platform>' option, and the
# options are joined with single spaces into one string.
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
def exit(msg,errorcode):
    'Print msg to standard output, then terminate the script with exit status errorcode.'
    # The parenthesized single-argument form prints exactly what the Python 2
    # print statement did, and is also valid Python 3.
    print(msg)
    sys.exit(errorcode)
@ -59,7 +65,7 @@ def recalibrate():
'Recalibrate the given bam file'
# generate the covariates
print 'generating covariates'
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
returncode = os.system(generate_covariates)
if returncode != 0:
exit('Unable to generate covariates',1)
@ -88,7 +94,7 @@ def evaluate():
'Evaluate recalibration results.'
print 'Evaluating recalibration results'
# regenerate the covariates
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
print 'regenerating covariates'
returncode = os.system(regenerate_covariates)
if returncode != 0: