Allow script to be easily modified to support different platforms.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@955 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7fa84ea157
commit
e77dfe9983
|
|
@ -66,6 +66,15 @@ To (only) evaluate a given bam file after calibrating:
|
|||
|
||||
python RecalQual.py --evaluate <source bam> <recalibrated bam>
|
||||
|
||||
Platforms
|
||||
---------
|
||||
By default, the recalibrator processes only read groups
|
||||
originating from Illumina sequencers. To enable calibration
|
||||
for other platforms, edit the 'platforms' array at the
|
||||
top of RecalQual.py. Platforms specified here should
|
||||
match, case-insensitively, the "PL" attribute of the read
|
||||
group in the BAM file.
|
||||
|
||||
Output
|
||||
------
|
||||
The recalibration process keeps many incremental
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.HashMap;
|
||||
import java.util.Collections;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
|
|
@ -44,6 +45,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
||||
public int MAX_READ_GROUPS = 100;
|
||||
|
||||
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = Illumina)")
|
||||
public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||
|
||||
int NDINUCS = 16;
|
||||
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||
HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
||||
|
|
@ -101,8 +105,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
if( readGroup.getAttribute("PL") == null )
|
||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
|
||||
if( !isIlluminaReadGroup(readGroup) )
|
||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
||||
if( !isSupportedReadGroup(readGroup) )
|
||||
continue;
|
||||
data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
|
||||
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
|
||||
|
|
@ -126,7 +130,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
for (int i =0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
||||
if ( isIlluminaReadGroup(readGroup) &&
|
||||
if ( isSupportedReadGroup(readGroup) &&
|
||||
!read.getReadNegativeStrandFlag() &&
|
||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
||||
|
|
@ -394,7 +398,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||
}
|
||||
|
||||
private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) {
|
||||
return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()));
|
||||
/**
|
||||
* Check to see whether this read group should be processed.
* <p>
* Each entry of {@code platforms} is trimmed and compared, ignoring case,
* against the read group's "PL" attribute. A read group whose "PL"
* attribute is unset is accepted as soon as any platform entry is
* examined, so it is rejected only when {@code platforms} is empty.
|
||||
* @param readGroup the read group whose "PL" attribute is tested.
|
||||
* @return true if the "PL" attribute is unset (and at least one platform is configured) or matches one of the configured platforms; false otherwise.
|
||||
*/
|
||||
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
|
||||
for( String platform: platforms ) {
|
||||
platform = platform.trim();
|
||||
if( readGroup.getAttribute("PL") == null || readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,9 @@ R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
|||
# Any special site-specific arguments to pass the JVM.
|
||||
jvm_args='-ea -Xmx4096m'
|
||||
|
||||
# Which platforms should the calibration tool be run over?
|
||||
platforms=['illumina']
|
||||
|
||||
# Where to put the output created as part of recalibration.
|
||||
# If editing, please end this variable with a trailing slash.
|
||||
output_root = './'
|
||||
|
|
@ -39,6 +42,9 @@ gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
|||
logistic_regression_script = resources + 'logistic_regression.R'
|
||||
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
||||
|
||||
# Assemble the platform list into command-line arguments.
|
||||
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
|
||||
|
||||
def exit(msg,errorcode):
|
||||
print msg
|
||||
sys.exit(errorcode)
|
||||
|
|
@ -59,7 +65,7 @@ def recalibrate():
|
|||
'Recalibrate the given bam file'
|
||||
# generate the covariates
|
||||
print 'generating covariates'
|
||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||
generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||
returncode = os.system(generate_covariates)
|
||||
if returncode != 0:
|
||||
exit('Unable to generate covariates',1)
|
||||
|
|
@ -88,7 +94,7 @@ def evaluate():
|
|||
'Evaluate recalibration results.'
|
||||
print 'Evaluating recalibration results'
|
||||
# regenerate the covariates
|
||||
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
|
||||
regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
|
||||
print 'regenerating covariates'
|
||||
returncode = os.system(regenerate_covariates)
|
||||
if returncode != 0:
|
||||
|
|
|
|||
Loading…
Reference in New Issue