Updating the half of the code that uses the recalibration information to work with the new BQSR refactoring. Reverting the covariate interface change in the original BQSR, because the error-model enum was moved to a different class and the change no longer made sense.

This commit is contained in:
Ryan Poplin 2012-02-11 10:57:20 -05:00
parent f52f1f659f
commit 9b8fd4c2ff
27 changed files with 165 additions and 179 deletions

View File

@ -107,7 +107,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
// which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) {
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
@ -135,7 +135,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
}
// Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
@ -190,7 +190,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker
final M x = walker.map( activeRegion, null );
return walker.reduce( x, sum );
}

View File

@ -5,14 +5,12 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -77,7 +75,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
public abstract double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
// Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();

View File

@ -22,8 +22,8 @@ import java.util.*;
* User: chartl
* Date: 9/14/11
* Time: 12:24 PM
* To change this template use File | Settings | File Templates.
*/
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null;

View File

@ -98,4 +98,9 @@ public class ContextCovariate implements StandardCovariate {
return s;
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return str;
}
}

View File

@ -53,6 +53,7 @@ public interface Covariate {
*/
public CovariateValues getValues(GATKSAMRecord read);
public Comparable getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
}
interface RequiredCovariate extends Covariate {}

View File

@ -15,18 +15,18 @@ public class CovariateKeySet {
private int nextCovariateIndex;
private final String mismatchesCovariateName = "M";
private final String insertionsCovariateName = "I";
private final String deletionsCovariateName = "D";
public final static String mismatchesCovariateName = "M";
public final static String insertionsCovariateName = "I";
public final static String deletionsCovariateName = "D";
public CovariateKeySet(int readLength, int numberOfCovariates) {
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
this.insertionsKeySet = new Object[readLength][numberOfCovariates];
this.deletionsKeySet = new Object[readLength][numberOfCovariates];
initializeCovariateKeySet(this.mismatchesKeySet, this.mismatchesCovariateName);
initializeCovariateKeySet(this.insertionsKeySet, this.insertionsCovariateName);
initializeCovariateKeySet(this.deletionsKeySet, this.deletionsCovariateName);
initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);
this.nextCovariateIndex = 0;
}

View File

@ -196,4 +196,9 @@ public class CycleCovariate implements StandardCovariate {
return new CovariateValues(cycles, cycles, cycles);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -39,39 +39,35 @@ import java.util.Arrays;
public class QualityScoreCovariate implements RequiredCovariate {
private byte defaultMismatchesQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultInsertionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultDeletionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
defaultMismatchesQuality = RAC.MISMATCHES_DEFAULT_QUALITY;
defaultInsertionsQuality = RAC.INSERTIONS_DEFAULT_QUALITY;
defaultDeletionsQuality = RAC.DELETIONS_DEFAULT_QUALITY;
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength();
Byte [] mismatches = new Byte[readLength];
Byte [] insertions = new Byte[readLength];
Byte [] deletions = new Byte[readLength];
Integer [] mismatches = new Integer[readLength];
Integer [] insertions = new Integer[readLength];
Integer [] deletions = new Integer[readLength];
byte [] baseQualities = read.getBaseQualities();
byte [] baseInsertionQualities = read.getBaseInsertionQualities();
byte [] baseDeletionQualities = read.getBaseDeletionQualities();
if (defaultMismatchesQuality >= 0)
Arrays.fill(mismatches, defaultMismatchesQuality); // if the user decides to override the base qualities in the read, use the flat value
else {
for (int i=0; i<baseQualities.length; i++)
mismatches[i] = baseQualities[i];
for (int i=0; i<baseQualities.length; i++) {
mismatches[i] = (int) baseQualities[i];
insertions[i] = (int) baseInsertionQualities[i];
deletions[i] = (int) baseDeletionQualities[i];
}
Arrays.fill(insertions, defaultInsertionsQuality); // Some day in the future when base insertion and base deletion quals exist the samtools API will
Arrays.fill( deletions, defaultDeletionsQuality); // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat value (parameter)
return new CovariateValues(mismatches, insertions, deletions);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -52,6 +52,12 @@ public class ReadGroupCovariate implements RequiredCovariate {
Arrays.fill(readGroups, readGroupId);
return new CovariateValues(readGroups, readGroups, readGroups);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return str;
}
}

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -52,20 +53,23 @@ import java.util.Map;
*/
public class RecalDataManager {
public final NestedHashMap nestedHashMap; // The full dataset
private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final NestedHashMap nestedHashMap; // The full dataset
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
public enum BaseRecalibrationType {
BASE_SUBSTITUTION,
BASE_INSERTION,
BASE_DELETION
}
public enum SOLID_RECAL_MODE {
/**
@ -109,13 +113,18 @@ public class RecalDataManager {
}
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
nestedHashMap = null;
dataCollapsedReadGroup = new NestedHashMap();
dataCollapsedQualityScore = new NestedHashMap();
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add(new NestedHashMap());
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap());
}
}
}
else {
@ -137,7 +146,7 @@ public class RecalDataManager {
* @param fullDatum The RecalDatum which is the data for this mapping
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
*/
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) {
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
@ -151,9 +160,9 @@ public class RecalDataManager {
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey);
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
}
else {
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
@ -163,9 +172,9 @@ public class RecalDataManager {
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
qualityScoreCollapsedKey[1] = key[1]; // and quality score
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
@ -178,9 +187,9 @@ public class RecalDataManager {
final Object theCovariateElement = key[iii + 2]; // and the given covariate
if (theCovariateElement != null) {
covariateCollapsedKey[2] = theCovariateElement;
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
@ -198,11 +207,13 @@ public class RecalDataManager {
*/
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
for (NestedHashMap map : dataCollapsedByCovariate) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
checkForSingletons(map.data);
for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual);
for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
checkForSingletons(map.data);
}
}
}
@ -241,15 +252,15 @@ public class RecalDataManager {
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NestedHashMap getCollapsedTable(final int covariate) {
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
if (covariate == 0) {
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
}
else if (covariate == 1) {
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed
}
else {
return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
}
}
@ -260,7 +271,7 @@ public class RecalDataManager {
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
GATKSAMReadGroupRecord readGroup = read.getReadGroup();
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);

View File

@ -56,7 +56,7 @@ public class ContextCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
byte[] bases = read.getReadBases();
for (int i = 0; i < read.getReadLength(); i++)
comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i));

View File

@ -378,7 +378,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates));
}
// Skip this position if base quality is zero

View File

@ -43,7 +43,7 @@ public interface Covariate {
public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public void getValues(GATKSAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType);
public void getValues(GATKSAMRecord read, Comparable[] comparable);
//Takes an array of size (at least) read.getReadLength() and fills it with covariate
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
//read-specific calculations to be done just once rather than for each offset.

View File

@ -66,7 +66,7 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
//-----------------------------
// Illumina, Solid, PacBio, and Complete Genomics

View File

@ -66,7 +66,7 @@ public class DinucCovariate implements StandardCovariate {
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
*/
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
final int readLength = read.getReadLength();
final boolean negativeStrand = read.getReadNegativeStrandFlag();

View File

@ -82,7 +82,7 @@ public class GCContentCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -95,7 +95,7 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -55,7 +55,7 @@ public class MappingQualityCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read); // BUGBUG: this can be optimized
}

View File

@ -65,7 +65,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -55,7 +55,7 @@ public class PositionCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -62,7 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -46,16 +46,10 @@ public class QualityScoreCovariate implements RequiredCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
if (modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION) {
byte[] baseQualities = read.getBaseQualities();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
}
else { // model == BASE_INSERTION || model == BASE_DELETION
Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
byte[] baseQualities = read.getBaseQualities();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
}

View File

@ -44,7 +44,7 @@ public class ReadGroupCovariate implements RequiredCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
final String readGroupId = read.getReadGroup().getReadGroupId();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = readGroupId;

View File

@ -63,7 +63,6 @@ public class RecalDataManager {
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE {
@ -604,7 +603,7 @@ public class RecalDataManager {
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*/
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType) {
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
//compute all covariates for this read
final int numRequestedCovariates = requestedCovariates.size();
final int readLength = gatkRead.getReadLength();
@ -613,7 +612,7 @@ public class RecalDataManager {
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder);
for (int j = 0; j < readLength; j++)
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
}

View File

@ -405,7 +405,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates);
// For each base in the read
for (int offset = 0; offset < read.getReadLength(); offset++) {

View File

@ -25,14 +25,12 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -52,19 +50,13 @@ import java.util.regex.Pattern;
public class BaseRecalibration {
public enum BaseRecalibrationType {
BASE_SUBSTITUTION,
BASE_INSERTION,
BASE_DELETION
}
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
public static final String EOF_MARKER = "EOF";
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values.
public BaseRecalibration( final File RECAL_FILE ) {
// Get a list of all available covariates
@ -89,7 +81,7 @@ public class BaseRecalibration {
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
@ -160,7 +152,7 @@ public class BaseRecalibration {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
@ -172,39 +164,63 @@ public class BaseRecalibration {
cov = requestedCovariates.get( iii );
key[iii] = cov.getValue( vals[iii] );
}
final String modelString = vals[iii++];
final RecalDataManager.BaseRecalibrationType errorModel = ( modelString.equals(CovariateKeySet.mismatchesCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION :
( modelString.equals(CovariateKeySet.insertionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_INSERTION :
( modelString.equals(CovariateKeySet.deletionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_DELETION : null ) ) );
// Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
}
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals, final BaseRecalibrationType modelType ) {
public void recalibrateRead( final GATKSAMRecord read ) {
final byte[] recalQuals = originalQuals.clone();
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates, modelType);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = qualityScore;
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
RecalDataManager.computeCovariates(read, requestedCovariates);
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read );
for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) {
final byte[] originalQuals = ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? read.getBaseQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? read.getBaseDeletionQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? read.getBaseDeletionQualities() : null ) ) );
final byte[] recalQuals = originalQuals.clone();
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
return recalQuals;
final Object[] fullCovariateKey =
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? covariateKeySet.getMismatchesKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? covariateKeySet.getInsertionsKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? covariateKeySet.getDeletionsKeySet(offset) : null ) ) );
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if( qualityScore == null ) {
qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = qualityScore;
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
switch (errorModel) {
case BASE_SUBSTITUTION:
read.setBaseQualities( recalQuals );
break;
case BASE_INSERTION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, recalQuals );
break;
case BASE_DELETION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, recalQuals );
break;
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
}
}
/**
@ -222,7 +238,7 @@ public class BaseRecalibration {
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation( final Object... key ) {
private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) {
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
@ -231,7 +247,7 @@ public class BaseRecalibration {
// The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0];
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey ));
double globalDeltaQ = 0.0;
if( globalRecalDatum != null ) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
@ -242,7 +258,7 @@ public class BaseRecalibration {
// The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1];
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey ));
double deltaQReported = 0.0;
if( qReportedRecalDatum != null ) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
@ -256,7 +272,7 @@ public class BaseRecalibration {
covariateCollapsedKey[1] = key[1];
for( int iii = 2; iii < key.length; iii++ ) {
covariateCollapsedKey[2] = key[iii]; // The given covariate
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey ));
if( covariateRecalDatum != null ) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
@ -265,18 +281,6 @@ public class BaseRecalibration {
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
//}
//else {
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
//}
//return newQualityByte;
}
/**
@ -291,5 +295,4 @@ public class BaseRecalibration {
}
}
}
}

View File

@ -54,7 +54,6 @@ public class GATKSAMRecord extends BAMRecord {
// Base Quality Score Recalibrator specific attribute tags
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";
public static final String BQSR_BASE_DELETION_QUALITIES = "BD";
public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR";
// the SAMRecord data we're caching
private String mReadString = null;
@ -163,27 +162,6 @@ public class GATKSAMRecord extends BAMRecord {
return super.equals(o);
}
@Override
public byte[] getBaseQualities() {
return super.getBaseQualities();
/*
if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) {
return super.getBaseQualities();
} else {
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() );
setBaseQualities(quals);
setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true );
return quals;
} else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly
return super.getBaseQualities();
}
}
*/
}
/**
* Accessors for base insertion and base deletion quality scores
*/
@ -191,13 +169,8 @@ public class GATKSAMRecord extends BAMRecord {
byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES );
if( quals == null ) {
quals = new byte[getBaseQualities().length];
Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_INSERTION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals );
}
return quals;
@ -207,13 +180,8 @@ public class GATKSAMRecord extends BAMRecord {
byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES );
if( quals == null ) {
quals = new byte[getBaseQualities().length];
Arrays.fill(quals, (byte) 45);
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_DELETION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
setAttribute( BQSR_BASE_DELETION_QUALITIES, quals );
}
return quals;