Further optimizations of TableRecalibration. This completes my goal of having the only math done in the map function be addition, subtraction and rounding the quality score to an integer. Everything else has been moved to the initialize method and only done once.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2145 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e4546f802c
commit
277e6d6b32
|
|
@ -110,7 +110,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private final String versionNumber = "2.0.3"; // Major version, minor version, and build number
|
||||
private final String versionNumber = "2.0.4"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -320,7 +320,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroupCode;
|
||||
platform = "ILLUMINA";
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
|
|
@ -332,7 +332,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
platform = "ILLUMINA";
|
||||
}
|
||||
if( IGNORE_READGROUP ) {
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroupCode;
|
||||
}
|
||||
|
||||
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
|
||||
|
|
@ -348,11 +348,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
if( readDatum.quals[offset] > 0 ) {
|
||||
|
||||
refBase = (byte)ref.getBase();
|
||||
prevBase = readDatum.bases[offset-1];
|
||||
prevBase = readDatum.bases[offset - 1];
|
||||
|
||||
// Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed
|
||||
if( readDatum.isNegStrand ) {
|
||||
prevBase = readDatum.bases[offset+1];
|
||||
prevBase = readDatum.bases[offset + 1];
|
||||
}
|
||||
|
||||
// Skip if this base or the previous one was an 'N' or etc.
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class ReadGroupCovariate implements Covariate{
|
||||
|
||||
public static final String collapsedReadGroup = "Read Group Collapsed";
|
||||
public static final String collapsedReadGroupCode = "Read Group Collapsed";
|
||||
|
||||
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,26 +38,26 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
public NHashMap<RecalDatum> data; // the full dataset
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
public NHashMap<RecalDatum> data; // The full dataset
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
private NHashMap<Double> dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed
|
||||
|
||||
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
private NHashMap<Double> dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
|
||||
private NHashMap<Double> dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
public NHashMap<Double> aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed
|
||||
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
|
||||
RecalDataManager() {
|
||||
data = new NHashMap<RecalDatum>();
|
||||
}
|
||||
|
||||
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // initialize all the collapsed tables
|
||||
if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
|
|
@ -65,13 +65,14 @@ public class RecalDataManager {
|
|||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
}
|
||||
dataSumExpectedErrors = new NHashMap<Double>();
|
||||
aggregateReportedQuality = new NHashMap<Double>();
|
||||
} else {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
RecalDataManager( final int estimatedCapacity ) {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // Second arg is the 'loading factor',
|
||||
// a number to monkey around with when optimizing performace of the HashMap
|
||||
}
|
||||
|
||||
|
|
@ -85,9 +86,9 @@ public class RecalDataManager {
|
|||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
|
|
@ -95,9 +96,9 @@ public class RecalDataManager {
|
|||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
// Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
|
||||
if( sumExpectedErrors == null ) {
|
||||
dataSumExpectedErrors.put( newKey, 0.0 );
|
||||
|
|
@ -108,8 +109,8 @@ public class RecalDataManager {
|
|||
}
|
||||
|
||||
newKey = new ArrayList<Comparable>();
|
||||
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score
|
||||
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
|
|
@ -118,10 +119,10 @@ public class RecalDataManager {
|
|||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score ...
|
||||
newKey.add( key.get(iii + 2) ); // and the given covariate
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
|
|
@ -151,24 +152,30 @@ public class RecalDataManager {
|
|||
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
|
||||
// Looping over the entrySet is really expensive but worth it
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
|
||||
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score
|
||||
QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) );
|
||||
}
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
|
||||
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
}
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
|
||||
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
}
|
||||
}
|
||||
|
||||
dataSumExpectedErrors.clear();
|
||||
dataCollapsedReadGroup.clear();
|
||||
dataCollapsedQualityScore.clear();
|
||||
dataCollapsedByCovariate.clear();
|
||||
dataCollapsedQualityScore = null; // will never need this again
|
||||
dataCollapsedByCovariate = null; // will never need this again
|
||||
if( data!=null ) {
|
||||
dataSumExpectedErrors = null; // Will never need this table again
|
||||
dataCollapsedReadGroup = null; // Will never need this table again
|
||||
dataCollapsedQualityScore = null; // Will never need this table again
|
||||
dataCollapsedByCovariate = null; // Will never need this table again
|
||||
if( data != null ) {
|
||||
data.clear();
|
||||
data = null; // will never need this again
|
||||
data = null; // Will never need this table again
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -179,11 +186,11 @@ public class RecalDataManager {
|
|||
*/
|
||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
|
||||
return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -194,11 +201,11 @@ public class RecalDataManager {
|
|||
*/
|
||||
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||
return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
|
|
@ -103,7 +101,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
private final String versionNumber = "2.0.1"; // Major version, minor version, and build number
|
||||
private final String versionNumber = "2.0.2"; // Major version, minor version, and build number
|
||||
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -217,6 +215,18 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||
logger.info( "...done!" );
|
||||
|
||||
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
|
||||
//SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
|
||||
//SAMSequenceDictionary referenceDictionary =
|
||||
// ReferenceSequenceFileFactory.getReferenceSequenceFile(getToolkit().getArguments().referenceFile).getSequenceDictionary();
|
||||
//header.setSequenceDictionary(referenceDictionary);
|
||||
//header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
|
||||
//if( OUTPUT_BAM != null ) {
|
||||
// SAMFileWriterFactory factory = new SAMFileWriterFactory();
|
||||
// OUTPUT_BAM = factory.makeBAMWriter(header, false, new File(OUTPUT_BAM.toString()), 5); // Bam compression = 5
|
||||
//}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -288,7 +298,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroupCode;
|
||||
platform = "ILLUMINA";
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
|
|
@ -307,7 +317,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
platform = "ILLUMINA";
|
||||
}
|
||||
if( IGNORE_READGROUP ) {
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroupCode;
|
||||
}
|
||||
|
||||
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
|
||||
|
|
@ -363,18 +373,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
// The global quality shift (over the read group only)
|
||||
collapsedTableKey.add( readGroupKeyElement );
|
||||
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey);
|
||||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey);
|
||||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
|
||||
Double aggregrateQreported = dataManager.aggregateReportedQuality.get( collapsedTableKey );
|
||||
double globalDeltaQ = 0.0;
|
||||
double aggregrateQreported = 0.0;
|
||||
if( globalDeltaQDatum != null ) {
|
||||
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) );
|
||||
if( globalDeltaQEmpirical != null && aggregrateQreported != null ) {
|
||||
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
collapsedTableKey.add( qualityScoreKeyElement );
|
||||
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey);
|
||||
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( collapsedTableKey );
|
||||
double deltaQReported = 0.0;
|
||||
if( deltaQReportedEmpirical != null ) {
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
|
|
@ -387,7 +396,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
double deltaQDinuc = 0.0;
|
||||
for( int iii = 2; iii < key.size(); iii++ ) {
|
||||
collapsedTableKey.add( key.get(iii) ); // The given covariate
|
||||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
|
||||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( collapsedTableKey );
|
||||
if( deltaQCovariateEmpirical != null ) {
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue