From 9b8fd4c2ff6c63bc60fbab973590ca495cf99cfd Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 11 Feb 2012 10:57:20 -0500 Subject: [PATCH] Updating the half of the code that makes use of the recalibration information to work with the new refactoring of the bqsr. Reverting the covariate interface change in the original bqsr because the error model enum was moved to a different class and didn't make sense any more. --- .../traversals/TraverseActiveRegions.java | 6 +- .../gatk/walkers/ActiveRegionWalker.java | 4 +- .../walkers/annotator/MVLikelihoodRatio.java | 2 +- .../gatk/walkers/bqsr/ContextCovariate.java | 5 + .../sting/gatk/walkers/bqsr/Covariate.java | 1 + .../gatk/walkers/bqsr/CovariateKeySet.java | 12 +- .../gatk/walkers/bqsr/CycleCovariate.java | 5 + .../walkers/bqsr/QualityScoreCovariate.java | 36 +++--- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 6 + .../gatk/walkers/bqsr/RecalDataManager.java | 69 ++++++----- .../recalibration/ContextCovariate.java | 2 +- .../recalibration/CountCovariatesWalker.java | 2 +- .../gatk/walkers/recalibration/Covariate.java | 2 +- .../walkers/recalibration/CycleCovariate.java | 2 +- .../walkers/recalibration/DinucCovariate.java | 2 +- .../recalibration/GCContentCovariate.java | 2 +- .../recalibration/HomopolymerCovariate.java | 2 +- .../MappingQualityCovariate.java | 2 +- .../recalibration/MinimumNQSCovariate.java | 2 +- .../recalibration/PositionCovariate.java | 2 +- .../recalibration/PrimerRoundCovariate.java | 2 +- .../recalibration/QualityScoreCovariate.java | 14 +-- .../recalibration/ReadGroupCovariate.java | 2 +- .../recalibration/RecalDataManager.java | 5 +- .../TableRecalibrationWalker.java | 2 +- .../recalibration/BaseRecalibration.java | 113 +++++++++--------- .../sting/utils/sam/GATKSAMRecord.java | 40 +------ 27 files changed, 165 insertions(+), 179 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 58c2df877..70fe43755 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -107,7 +107,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker + final M x = walker.map( activeRegion, null ); return walker.reduce( x, sum ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 244870c78..6403f15a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -5,14 +5,12 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -77,7 +75,7 @@ public abstract class ActiveRegionWalker extends Walker 0 to be used, otherwise we use the quality from the read. - private byte defaultInsertionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read. - private byte defaultDeletionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read. - // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { - defaultMismatchesQuality = RAC.MISMATCHES_DEFAULT_QUALITY; - defaultInsertionsQuality = RAC.INSERTIONS_DEFAULT_QUALITY; - defaultDeletionsQuality = RAC.DELETIONS_DEFAULT_QUALITY; } @Override public CovariateValues getValues(final GATKSAMRecord read) { int readLength = read.getReadLength(); - - Byte [] mismatches = new Byte[readLength]; - Byte [] insertions = new Byte[readLength]; - Byte [] deletions = new Byte[readLength]; - + + Integer [] mismatches = new Integer[readLength]; + Integer [] insertions = new Integer[readLength]; + Integer [] deletions = new Integer[readLength]; + byte [] baseQualities = read.getBaseQualities(); + byte [] baseInsertionQualities = read.getBaseInsertionQualities(); + byte [] baseDeletionQualities = read.getBaseDeletionQualities(); - if (defaultMismatchesQuality >= 0) - Arrays.fill(mismatches, defaultMismatchesQuality); // if the user decides to override the base qualities in the read, use the flat value - else { - for (int i=0; i dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + public final NestedHashMap nestedHashMap; // The full dataset + private final HashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullReadGroup = false; private static boolean warnUserNullPlatform = false; private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + public enum BaseRecalibrationType { + BASE_SUBSTITUTION, + BASE_INSERTION, + BASE_DELETION + } public enum SOLID_RECAL_MODE { /** @@ -109,13 +113,18 @@ public class RecalDataManager { } public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { - if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration nestedHashMap = null; - dataCollapsedReadGroup = new NestedHashMap(); - dataCollapsedQualityScore = new NestedHashMap(); - dataCollapsedByCovariate = new ArrayList(); - for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add(new NestedHashMap()); + dataCollapsedReadGroup = new HashMap(); + dataCollapsedQualityScore = new HashMap(); + dataCollapsedByCovariate = new HashMap>(); + for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + dataCollapsedReadGroup.put(errorModel, new NestedHashMap()); + dataCollapsedQualityScore.put(errorModel, new NestedHashMap()); + dataCollapsedByCovariate.put(errorModel, new ArrayList()); + for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap()); + } } } else { @@ -137,7 +146,7 @@ public class RecalDataManager { * @param fullDatum The RecalDatum which is the data for this mapping * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table */ - public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) { + public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table @@ -151,9 +160,9 @@ public class RecalDataManager { // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) { readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey); + collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey); if (collapsedDatum == null) { - dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey); + dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey); } else { collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported @@ -163,9 +172,9 @@ public class RecalDataManager { // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey); + collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey); if (collapsedDatum == null) { - dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); + dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); } else { collapsedDatum.increment(fullDatum); @@ -178,9 +187,9 @@ public class RecalDataManager { final Object theCovariateElement = key[iii + 2]; // and the given covariate if (theCovariateElement != null) { covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey); + collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey); if (collapsedDatum == null) { - dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); + dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); } else { collapsedDatum.increment(fullDatum); @@ -198,11 +207,13 @@ public class RecalDataManager { */ public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { - recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); - recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); - for (NestedHashMap map : dataCollapsedByCovariate) { - recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); - checkForSingletons(map.data); + for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { + recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual); + recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual); + for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) { + recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); + checkForSingletons(map.data); + } } } @@ -241,15 +252,15 @@ public class RecalDataManager { * @param covariate Which covariate indexes the desired collapsed HashMap * @return The desired collapsed HashMap */ - public final NestedHashMap getCollapsedTable(final int covariate) { + public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) { if (covariate == 0) { - return dataCollapsedReadGroup; // Table where everything except read group has been collapsed + return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed } else if (covariate == 1) { - return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed } else { - return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed + return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed } } @@ -260,7 +271,7 @@ public class RecalDataManager { * @param RAC The list of shared command line arguments */ public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { - GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); + GATKSAMReadGroupRecord readGroup = read.getReadGroup(); if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { readGroup.setPlatform(RAC.FORCE_PLATFORM); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java index 875782fdc..e1a7772db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java @@ -56,7 +56,7 @@ public class ContextCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { byte[] bases = read.getReadBases(); for (int i = 0; i < read.getReadLength(); i++) comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 626460be6..a99f35f45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -378,7 +378,7 @@ public class CountCovariatesWalker extends LocusWalker dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap final int readLength = read.getReadLength(); final boolean negativeStrand = read.getReadNegativeStrandFlag(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java index 7b209ae5c..14ffd35a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java @@ -82,7 +82,7 @@ public class GCContentCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index fd67edc3b..004fb0bdb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -95,7 +95,7 @@ public class HomopolymerCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index e22049890..54fa18106 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -55,7 +55,7 @@ public class MappingQualityCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 1dfb915b9..ecaa55006 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -65,7 +65,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index fbd1efc47..fd720697f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -55,7 +55,7 @@ public class PositionCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 8dfa11884..d6bdea5bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -62,7 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index 1ed4a6fe8..a29a0530c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -46,16 +46,10 @@ public class QualityScoreCovariate implements RequiredCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { - if (modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION) { - byte[] baseQualities = read.getBaseQualities(); - for (int i = 0; i < read.getReadLength(); i++) { - comparable[i] = (int) baseQualities[i]; - } - } - else { // model == BASE_INSERTION || model == BASE_DELETION - Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { + byte[] baseQualities = read.getBaseQualities(); + for (int i = 0; i < read.getReadLength(); i++) { + comparable[i] = (int) baseQualities[i]; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 27e1d8263..33adf4417 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -44,7 +44,7 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable) { final String readGroupId = read.getReadGroup().getReadGroupId(); for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = readGroupId; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 311e33f8a..1a6b8cfcb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -63,7 +63,6 @@ public class RecalDataManager { public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullReadGroup = false; private static boolean warnUserNullPlatform = false; public enum SOLID_RECAL_MODE { @@ -604,7 +603,7 @@ public class RecalDataManager { * value for the ith position in the read and the jth covariate in * reqeustedCovariates list. */ - public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType) { + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { //compute all covariates for this read final int numRequestedCovariates = requestedCovariates.size(); final int readLength = gatkRead.getReadLength(); @@ -613,7 +612,7 @@ public class RecalDataManager { final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType); + requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder); for (int j = 0; j < readLength; j++) covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index cd848cd9e..08151321f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -405,7 +405,7 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); public static final String EOF_MARKER = "EOF"; private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? - private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. + private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values. public BaseRecalibration( final File RECAL_FILE ) { // Get a list of all available covariates @@ -89,7 +81,7 @@ public class BaseRecalibration { throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); } else { // Found the covariate list in input file, loop through all of them and instantiate them String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical boolean foundClass = false; for( Class covClass : classes ) { if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { @@ -160,7 +152,7 @@ public class BaseRecalibration { final String[] vals = line.split(","); // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); } @@ -172,39 +164,63 @@ public class BaseRecalibration { cov = requestedCovariates.get( iii ); key[iii] = cov.getValue( vals[iii] ); } - + final String modelString = vals[iii++]; + final RecalDataManager.BaseRecalibrationType errorModel = ( modelString.equals(CovariateKeySet.mismatchesCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION : + ( modelString.equals(CovariateKeySet.insertionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_INSERTION : + ( modelString.equals(CovariateKeySet.deletionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_DELETION : null ) ) ); + // Create a new datum using the number of observations, number of mismatches, and reported quality score final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter + + dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter } - public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals, final BaseRecalibrationType modelType ) { + public void recalibrateRead( final GATKSAMRecord read ) { - final byte[] recalQuals = originalQuals.clone(); - //compute all covariate values for this read - final Comparable[][] covariateValues_offset_x_covar = - RecalDataManager.computeCovariates(read, requestedCovariates, modelType); - - // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { - - final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; - - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); - if(qualityScore == null) - { - qualityScore = performSequentialQualityCalculation( fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); - } - - recalQuals[offset] = qualityScore; - } - - preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + RecalDataManager.computeCovariates(read, requestedCovariates); + final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read ); + + for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) { + final byte[] originalQuals = ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? read.getBaseQualities() : + ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? read.getBaseDeletionQualities() : + ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? read.getBaseDeletionQualities() : null ) ) ); + final byte[] recalQuals = originalQuals.clone(); + + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { - return recalQuals; + final Object[] fullCovariateKey = + ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? covariateKeySet.getMismatchesKeySet(offset) : + ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? covariateKeySet.getInsertionsKeySet(offset) : + ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? covariateKeySet.getDeletionsKeySet(offset) : null ) ) ); + + Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); + if( qualityScore == null ) { + qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); + qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); + } + + recalQuals[offset] = qualityScore; + } + + preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + switch (errorModel) { + case BASE_SUBSTITUTION: + read.setBaseQualities( recalQuals ); + break; + case BASE_INSERTION: + read.setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, recalQuals ); + break; + case BASE_DELETION: + read.setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, recalQuals ); + break; + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); + } + } + } /** @@ -222,7 +238,7 @@ public class BaseRecalibration { * @param key The list of Comparables that were calculated from the covariates * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation( final Object... key ) { + private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) { final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); final Object[] readGroupCollapsedKey = new Object[1]; @@ -231,7 +247,7 @@ public class BaseRecalibration { // The global quality shift (over the read group only) readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey )); double globalDeltaQ = 0.0; if( globalRecalDatum != null ) { final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); @@ -242,7 +258,7 @@ public class BaseRecalibration { // The shift in quality between reported and empirical qualityScoreCollapsedKey[0] = key[0]; qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey )); double deltaQReported = 0.0; if( qReportedRecalDatum != null ) { final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); @@ -256,7 +272,7 @@ public class BaseRecalibration { covariateCollapsedKey[1] = key[1]; for( int iii = 2; iii < key.length; iii++ ) { covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey )); if( covariateRecalDatum != null ) { deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); @@ -265,18 +281,6 @@ public class BaseRecalibration { final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); - - // Verbose printouts used to validate with old recalibrator - //if(key.contains(null)) { - // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", - // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); - //} - //else { - // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", - // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); - //} - - //return newQualityByte; } /** @@ -291,5 +295,4 @@ public class BaseRecalibration { } } } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index bdcf2b210..f6b3d759c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -54,7 +54,6 @@ public class GATKSAMRecord extends BAMRecord { // Base Quality Score Recalibrator specific attribute tags public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; - public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR"; // the SAMRecord data we're caching private String mReadString = null; @@ -163,27 +162,6 @@ public class GATKSAMRecord extends BAMRecord { return super.equals(o); } - - @Override - public byte[] getBaseQualities() { - return super.getBaseQualities(); - /* - if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) { - return super.getBaseQualities(); - } else { - // if the recal data was populated in the engine then recalibrate the quality scores on the fly - if( GenomeAnalysisEngine.hasBaseRecalibration() ) { - final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() ); - setBaseQualities(quals); - setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true ); - return quals; - } else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly - return super.getBaseQualities(); - } - } - */ - } - /** * Accessors for base insertion and base deletion quality scores */ @@ -191,13 +169,8 @@ public class GATKSAMRecord extends BAMRecord { byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES ); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions - // if the recal data was populated in the engine then recalibrate the quality scores on the fly - // else give default values which are flat Q45 - if( GenomeAnalysisEngine.hasBaseRecalibration() ) { - quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_INSERTION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities - } - // add the qual array to the read so that we don't have to do the recalibration work again + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals ); } return quals; @@ -207,13 +180,8 @@ public class GATKSAMRecord extends BAMRecord { byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES ); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); - // if the recal data was populated in the engine then recalibrate the quality scores on the fly - // else give default values which are flat Q45 - if( GenomeAnalysisEngine.hasBaseRecalibration() ) { - quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_DELETION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities - } - // add the qual array to the read so that we don't have to do the recalibration work again + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 setAttribute( BQSR_BASE_DELETION_QUALITIES, quals ); } return quals;