Moving the covariates and shared functionality to public

so Ryan can work on the recalibration on the fly without breaking the build. Supposedly all the secret sauce is in the BQSR walker, which sits in private.
This commit is contained in:
Mauricio Carneiro 2012-02-10 11:43:48 -05:00
parent 48cc4b913a
commit 1fb19a0f98
11 changed files with 1623 additions and 0 deletions

View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 9/26/11
*/
public class ContextCovariate implements StandardCovariate {
private int mismatchesContextSize;
private int insertionsContextSize;
private int deletionsContextSize;
private String mismatchesNoContext = "";
private String insertionsNoContext = "";
private String deletionsNoContext = "";
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE;
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
// initialize no context strings given the size of the context for each covariate type
mismatchesNoContext = makeAllNStringWithLength(mismatchesContextSize);
insertionsNoContext = makeAllNStringWithLength(insertionsContextSize);
deletionsNoContext = makeAllNStringWithLength( deletionsContextSize);
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int l = read.getReadLength();
String[] mismatches = new String [l];
String[] insertions = new String [l];
String[] deletions = new String [l];
byte[] bases = read.getReadBases();
for (int i = 0; i < read.getReadLength(); i++) {
mismatches[i] = contextWith(bases, i, mismatchesContextSize, mismatchesNoContext);
insertions[i] = contextWith(bases, i, insertionsContextSize, insertionsNoContext);
deletions[i] = contextWith(bases, i, deletionsContextSize, deletionsNoContext);
}
return new CovariateValues(mismatches, insertions, deletions);
}
/**
* calculates the context of a base indenpendent of the covariate mode
*
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
* @param noContextString string to return if the position is not far enough in the read to have a full context before.
* @return
*/
private String contextWith(byte [] bases, int offset, int contextSize, String noContextString) {
return (offset < contextSize) ? noContextString : new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
}
private String makeAllNStringWithLength(int length) {
String s = "";
for (int i=0; i<length; i++)
s += "N";
return s;
}
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
*/
public interface Covariate {
/**
* Initialize any member variables using the command-line arguments passed to the walker
*
* @param RAC the recalibration argument collection
*/
public void initialize(RecalibrationArgumentCollection RAC);
/**
* Calculates covariate values for all positions in the read.
*
* @param read the read to calculate the covariates on.
* @return all the covariate values for every base in the read.
*/
public CovariateValues getValues(GATKSAMRecord read);
}
interface RequiredCovariate extends Covariate {}
interface StandardCovariate extends Covariate {}
interface ExperimentalCovariate extends Covariate {}

View File

@ -0,0 +1,63 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
/**
* The object temporarily held by a read that describes all of it's covariates.
*
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
*
* @author Mauricio Carneiro
* @since 2/8/12
*/
public class CovariateKeySet {
private Object[][] mismatchesKeySet;
private Object[][] insertionsKeySet;
private Object[][] deletionsKeySet;
private int nextCovariateIndex;
private final String mismatchesCovariateName = "M";
private final String insertionsCovariateName = "I";
private final String deletionsCovariateName = "D";
public CovariateKeySet(int readLength, int numberOfCovariates) {
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
this.insertionsKeySet = new Object[readLength][numberOfCovariates];
this.deletionsKeySet = new Object[readLength][numberOfCovariates];
initializeCovariateKeySet(this.mismatchesKeySet, this.mismatchesCovariateName);
initializeCovariateKeySet(this.insertionsKeySet, this.insertionsCovariateName);
initializeCovariateKeySet(this.deletionsKeySet, this.deletionsCovariateName);
this.nextCovariateIndex = 0;
}
public void addCovariate(CovariateValues covariate) {
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
nextCovariateIndex++;
}
public Object[] getMismatchesKeySet(int readPosition) {
return mismatchesKeySet[readPosition];
}
public Object[] getInsertionsKeySet(int readPosition) {
return insertionsKeySet[readPosition];
}
public Object[] getDeletionsKeySet(int readPosition) {
return deletionsKeySet[readPosition];
}
private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) {
for (int i=0; i<covariateValues.length; i++)
keySet[i][nextCovariateIndex] = covariateValues[i];
}
private void initializeCovariateKeySet (Object[][] keySet, String covariateName) {
int readLength = keySet.length;
int lastCovariateIndex = keySet[0].length - 1;
for (int i = 0; i < readLength; i++)
keySet[i][lastCovariateIndex] = covariateName;
}
}

View File

@ -0,0 +1,37 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
/**
* An object to hold the different covariate values for all bases in the read.
*
* Currently we have three different covariates for each read:
* - Mismatch
* - Insertion
* - Deletion
*
* @author Mauricio Carneiro
* @since 2/8/12
*/
public class CovariateValues {
private Comparable[] mismatches;
private Comparable[] insertions;
private Comparable[] deletions;
public CovariateValues(Comparable[] mismatch, Comparable[] insertion, Comparable[] deletion) {
this.mismatches = mismatch;
this.insertions = insertion;
this.deletions = deletion;
}
public Comparable[] getMismatches() {
return mismatches;
}
public Comparable[] getInsertions() {
return insertions;
}
public Comparable[] getDeletions() {
return deletions;
}
}

View File

@ -0,0 +1,199 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.EnumSet;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Cycle covariate.
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
* For example, for the read: AAACCCCGAAATTTTTACTG
* the cycle would be 11111111222333333344
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
*/
public class CycleCovariate implements StandardCovariate {
private final static EnumSet<NGSPlatform> DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM))
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform.");
}
// Used to pick out the covariate's value from attributes of the read
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
Integer [] cycles = new Integer[read.getReadLength()];
final NGSPlatform ngsPlatform = read.getNGSPlatform();
// Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int init;
final int increment;
if (!read.getReadNegativeStrandFlag()) {
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
//second of pair, positive strand
init = -1;
increment = -1;
}
else {
//first of pair, positive strand
init = 1;
increment = 1;
}
}
else {
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
//second of pair, negative strand
init = -read.getReadLength();
increment = 1;
}
else {
//first of pair, negative strand
init = read.getReadLength();
increment = -1;
}
}
int cycle = init;
for (int i = 0; i < read.getReadLength(); i++) {
cycles[i] = cycle;
cycle += increment;
}
}
// Flow cycle platforms
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int readLength = read.getReadLength();
final byte[] bases = read.getReadBases();
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
int cycle = multiplyByNegative1 ? -1 : 1;
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
if (!read.getReadNegativeStrandFlag()) { // Forward direction
int iii = 0;
while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') {
cycles[iii] = cycle;
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'A') {
cycles[iii] = cycle;
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'C') {
cycles[iii] = cycle;
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'G') {
cycles[iii] = cycle;
iii++;
}
if (iii < readLength) {
if (multiplyByNegative1)
cycle--;
else
cycle++;
}
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle;
iii++;
}
}
}
else { // Negative direction
int iii = readLength - 1;
while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') {
cycles[iii] = cycle;
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'A') {
cycles[iii] = cycle;
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'C') {
cycles[iii] = cycle;
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'G') {
cycles[iii] = cycle;
iii--;
}
if (iii >= 0) {
if (multiplyByNegative1)
cycle--;
else
cycle++;
}
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle;
iii--;
}
}
}
}
// Unknown platforms
else {
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
}
return new CovariateValues(cycles, cycles, cycles);
}
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* The Reported Quality Score covariate.
*/
public class QualityScoreCovariate implements RequiredCovariate {
private byte defaultMismatchesQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultInsertionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultDeletionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
defaultMismatchesQuality = RAC.MISMATCHES_DEFAULT_QUALITY;
defaultInsertionsQuality = RAC.INSERTIONS_DEFAULT_QUALITY;
defaultDeletionsQuality = RAC.DELETIONS_DEFAULT_QUALITY;
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength();
Byte [] mismatches = new Byte[readLength];
Byte [] insertions = new Byte[readLength];
Byte [] deletions = new Byte[readLength];
byte [] baseQualities = read.getBaseQualities();
if (defaultMismatchesQuality >= 0)
Arrays.fill(mismatches, defaultMismatchesQuality); // if the user decides to override the base qualities in the read, use the flat value
else {
for (int i=0; i<baseQualities.length; i++)
mismatches[i] = baseQualities[i];
}
Arrays.fill(insertions, defaultInsertionsQuality); // Some day in the future when base insertion and base deletion quals exist the samtools API will
Arrays.fill( deletions, defaultDeletionsQuality); // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat value (parameter)
return new CovariateValues(mismatches, insertions, deletions);
}
}

View File

@ -0,0 +1,57 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Read Group covariate.
*/
public class ReadGroupCovariate implements RequiredCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
final int l = read.getReadLength();
final String readGroupId = read.getReadGroup().getReadGroupId();
String [] readGroups = new String[l];
Arrays.fill(readGroups, readGroupId);
return new CovariateValues(readGroups, readGroups, readGroups);
}
}

View File

@ -0,0 +1,698 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 6, 2009
*
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
* It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
* This class holds the parsing methods that are shared between CountCovariates and TableRecalibration.
*/
public class RecalDataManager {
public final NestedHashMap nestedHashMap; // The full dataset
private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
public enum SOLID_RECAL_MODE {
/**
* Treat reference inserted bases as reference matching bases. Very unsafe!
*/
DO_NOTHING,
/**
* Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option.
*/
SET_Q_ZERO,
/**
* In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV.
*/
SET_Q_ZERO_BASE_N,
/**
* Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
*/
REMOVE_REF_BIAS
}
public enum SOLID_NOCALL_STRATEGY {
/**
* When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option.
*/
THROW_EXCEPTION,
/**
* Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare.
*/
LEAVE_READ_UNRECALIBRATED,
/**
* Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
*/
PURGE_READ
}
public RecalDataManager() {
nestedHashMap = new NestedHashMap();
dataCollapsedReadGroup = null;
dataCollapsedQualityScore = null;
dataCollapsedByCovariate = null;
}
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
nestedHashMap = null;
dataCollapsedReadGroup = new NestedHashMap();
dataCollapsedQualityScore = new NestedHashMap();
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add(new NestedHashMap());
}
}
else {
nestedHashMap = new NestedHashMap();
dataCollapsedReadGroup = null;
dataCollapsedQualityScore = null;
dataCollapsedByCovariate = null;
}
}
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) {
return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
}
/**
* Add the given mapping to all of the collapsed hash tables
*
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
*/
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
final int qualityScore = Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3];
RecalDatum collapsedDatum;
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey);
}
else {
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
}
}
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
qualityScoreCollapsedKey[1] = key[1]; // and quality score
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
}
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) {
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
covariateCollapsedKey[1] = key[1]; // and quality score ...
final Object theCovariateElement = key[iii + 2]; // and the given covariate
if (theCovariateElement != null) {
covariateCollapsedKey[2] = theCovariateElement;
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
}
}
}
}
/**
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
* that will be used in the sequential calculation in TableRecalibrationWalker
*
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
* @param maxQual At which value to cap the quality scores
*/
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
for (NestedHashMap map : dataCollapsedByCovariate) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
checkForSingletons(map.data);
}
}
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
for (Object comp : data.keySet()) {
final Object val = data.get(comp);
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
}
else { // Another layer in the nested hash map
recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
}
}
}
private void checkForSingletons(final Map data) {
// todo -- this looks like it's better just as a data.valueSet() call?
for (Object comp : data.keySet()) {
final Object val = data.get(comp);
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
if (data.keySet().size() == 1) {
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
// in a previous step of the sequential calculation model
}
}
else { // Another layer in the nested hash map
checkForSingletons((Map) val);
}
}
}
/**
* Get the appropriate collapsed table out of the set of all the tables held by this Object
*
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NestedHashMap getCollapsedTable(final int covariate) {
if (covariate == 0) {
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
}
else if (covariate == 1) {
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
}
else {
return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
}
}
/**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
*
* @param read The read to adjust
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);
}
if (readGroup.getPlatform() == null) {
if (RAC.DEFAULT_PLATFORM != null) {
if (!warnUserNullPlatform) {
Utils.warnUser("The input .bam file contains reads with no platform information. " +
"Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName());
warnUserNullPlatform = true;
}
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
}
else {
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName());
}
}
}
/**
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
*
* @param read The SAMRecord to parse
*/
public static void parseColorSpace(final GATKSAMRecord read) {
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
if (ReadUtils.isSOLiDRead(read)) {
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpace;
if (attr instanceof String) {
colorSpace = ((String) attr).getBytes();
}
else {
throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
}
final byte[] inconsistency = new byte[readBases.length];
int iii;
byte prevBase = colorSpace[0]; // The sentinel
for (iii = 0; iii < readBases.length; iii++) {
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1);
prevBase = readBases[iii];
}
read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency);
}
else {
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
}
}
}
/**
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
*
* @param read The SAMRecord to parse
* @param originalQualScores The array of original quality scores to modify during the correction
* @param solidRecalMode Which mode of solid recalibration to apply
* @param refBases The reference for this read
* @return A new array of quality scores that have been ref bias corrected
*/
public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpace;
if (attr instanceof String) {
colorSpace = ((String) attr).getBytes();
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
final byte[] colorImpliedBases = readBases.clone();
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone());
}
final int[] inconsistency = new int[readBases.length];
byte prevBase = colorSpace[0]; // The sentinel
for (int iii = 0; iii < readBases.length; iii++) {
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
colorImpliedBases[iii] = thisBase;
inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1);
prevBase = readBases[iii];
}
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0
final boolean setBaseN = false;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
}
else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) {
final boolean setBaseN = true;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
}
else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
}
}
else {
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
return originalQualScores;
}
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
if (ReadUtils.isSOLiDRead(read)) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpace;
if (attr instanceof String) {
colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
for (byte color : colorSpace) {
if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') {
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
}
}
}
else {
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
}
return false; // There aren't any color no calls in this SOLiD read
}
/**
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param originalQualScores The array of original quality scores to set to zero if needed
* @param refBases The reference which has been RC'd if necessary
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
* @return The byte array of original quality scores some of which might have been set to zero
*/
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
final boolean negStrand = read.getReadNegativeStrandFlag();
for (int iii = 1; iii < originalQualScores.length; iii++) {
if (inconsistency[iii] == 1) {
if (readBases[iii] == refBases[iii]) {
if (negStrand) {
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
}
else {
originalQualScores[iii] = (byte) 0;
}
if (setBaseN) {
readBases[iii] = (byte) 'N';
}
}
// Set the prev base to Q0 as well
if (readBases[iii - 1] == refBases[iii - 1]) {
if (negStrand) {
originalQualScores[originalQualScores.length - iii] = (byte) 0;
}
else {
originalQualScores[iii - 1] = (byte) 0;
}
if (setBaseN) {
readBases[iii - 1] = (byte) 'N';
}
}
}
}
if (negStrand) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
return originalQualScores;
}
/**
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
* @param refBases The reference which has been RC'd if necessary
*/
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpaceQuals;
if (attr instanceof String) {
String x = (String) attr;
colorSpaceQuals = x.getBytes();
SAMUtils.fastqToPhred(colorSpaceQuals);
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
if (inconsistency[iii] == 1) {
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
if (readBases[jjj] == refBases[jjj]) {
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
if (rand == 0) { // The color implied base won the coin flip
readBases[jjj] = colorImpliedBases[jjj];
}
}
else {
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
int diffInQuality = maxQuality - minQuality;
int numLow = minQuality;
if (numLow == 0) {
numLow++;
diffInQuality++;
}
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
if (rand >= numLow) { // higher q score won
if (maxQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had higher q score, and won out, so nothing to do here
}
else { // lower q score won
if (minQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had lower q score, and won out, so nothing to do here
}
}
}
}
}
}
}
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
}
else { // No color space quality tag in file
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
}
}
/**
* Given the base and the color calculate the next base in the sequence
*
* @param prevBase The base
* @param color The color
* @return The next base in the sequence
*/
private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) {
switch (color) {
case '0':
return prevBase;
case '1':
return performColorOne(prevBase);
case '2':
return performColorTwo(prevBase);
case '3':
return performColorThree(prevBase);
default:
throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color +
" Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
}
}
/**
* Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
*
* @param read The read which contains the color space to check against
* @param offset The offset in the read at which to check
* @return Returns true if the base was inconsistent with the color space
*/
public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
if (attr != null) {
final byte[] inconsistency = (byte[]) attr;
// NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
if (read.getReadNegativeStrandFlag()) { // Negative direction
return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
}
else { // Forward direction
return inconsistency[offset] != (byte) 0;
}
// This block of code is for if you want to check both the offset and the next base for color space inconsistency
//if( read.getReadNegativeStrandFlag() ) { // Negative direction
// if( offset == 0 ) {
// return inconsistency[0] != 0;
// } else {
// return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0);
// }
//} else { // Forward direction
// if( offset == inconsistency.length - 1 ) {
// return inconsistency[inconsistency.length - 1] != 0;
// } else {
// return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0);
// }
//}
}
else { // No inconsistency array, so nothing is inconsistent
return false;
}
}
/**
* Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..).
*
* @param read The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
* @return An array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*/
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
final int numRequestedCovariates = requestedCovariates.size();
final int readLength = read.getReadLength();
final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (Covariate covariate : requestedCovariates)
covariateKeySet.addCovariate(covariate.getValues(read));
read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
}
/**
* Perform a certain transversion (A <-> C or G <-> T) on the base.
*
* @param base the base [AaCcGgTt]
* @return the transversion of the base, or the input base if it's not one of the understood ones
*/
private static byte performColorOne(byte base) {
switch (base) {
case 'A':
case 'a':
return 'C';
case 'C':
case 'c':
return 'A';
case 'G':
case 'g':
return 'T';
case 'T':
case 't':
return 'G';
default:
return base;
}
}
/**
* Perform a transition (A <-> G or C <-> T) on the base.
*
* @param base the base [AaCcGgTt]
* @return the transition of the base, or the input base if it's not one of the understood ones
*/
private static byte performColorTwo(byte base) {
switch (base) {
case 'A':
case 'a':
return 'G';
case 'C':
case 'c':
return 'T';
case 'G':
case 'g':
return 'A';
case 'T':
case 't':
return 'C';
default:
return base;
}
}
/**
* Return the complement (A <-> T or C <-> G) of a base.
*
* @param base the base [AaCcGgTt]
* @return the complementary base, or the input base if it's not one of the understood ones
*/
private static byte performColorThree(byte base) {
switch (base) {
case 'A':
case 'a':
return 'T';
case 'C':
case 'c':
return 'G';
case 'G':
case 'g':
return 'C';
case 'T':
case 't':
return 'A';
default:
return base;
}
}
}

View File

@ -0,0 +1,112 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
*/
public class RecalDatum extends RecalDatumOptimized {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
//---------------------------------------------------------------------------------------------------------------
//
// constructors
//
//---------------------------------------------------------------------------------------------------------------
public RecalDatum() {
numObservations = 0L;
numMismatches = 0L;
estimatedQReported = 0.0;
empiricalQuality = 0.0;
}
public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) {
numObservations = _numObservations;
numMismatches = _numMismatches;
estimatedQReported = _estimatedQReported;
empiricalQuality = _empiricalQuality;
}
public RecalDatum(final RecalDatum copy) {
this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches;
this.estimatedQReported = copy.estimatedQReported;
this.empiricalQuality = copy.empiricalQuality;
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
//
//---------------------------------------------------------------------------------------------------------------
public final void combine(final RecalDatum other) {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment(other.numObservations, other.numMismatches);
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
}
//---------------------------------------------------------------------------------------------------------------
//
// methods to derive empirical quality score
//
//---------------------------------------------------------------------------------------------------------------
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
}
//---------------------------------------------------------------------------------------------------------------
//
// misc. methods
//
//---------------------------------------------------------------------------------------------------------------
public final double getEstimatedQReported() {
return estimatedQReported;
}
public final double getEmpiricalQuality() {
return empiricalQuality;
}
private double calcExpectedErrors() {
return (double) this.numObservations * qualToErrorProb(estimatedQReported);
}
private double qualToErrorProb(final double qual) {
return Math.pow(10.0, qual / -10.0);
}
}

View File

@ -0,0 +1,115 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.List;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 6, 2010
*
* An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
*/
public class RecalDatumOptimized {
protected long numObservations; // number of bases seen in total
protected long numMismatches; // number of bases seen that didn't match the reference
//---------------------------------------------------------------------------------------------------------------
//
// constructors
//
//---------------------------------------------------------------------------------------------------------------
public RecalDatumOptimized() {
numObservations = 0L;
numMismatches = 0L;
}
public RecalDatumOptimized(final long _numObservations, final long _numMismatches) {
numObservations = _numObservations;
numMismatches = _numMismatches;
}
public RecalDatumOptimized(final RecalDatumOptimized copy) {
this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches;
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
//
//---------------------------------------------------------------------------------------------------------------
public synchronized final void increment(final long incObservations, final long incMismatches) {
numObservations += incObservations;
numMismatches += incMismatches;
}
public synchronized final void increment(final RecalDatumOptimized other) {
increment(other.numObservations, other.numMismatches);
}
public synchronized final void increment(final List<RecalDatumOptimized> data) {
for (RecalDatumOptimized other : data) {
this.increment(other);
}
}
//---------------------------------------------------------------------------------------------------------------
//
// methods to derive empirical quality score
//
//---------------------------------------------------------------------------------------------------------------
public final double empiricalQualDouble(final int smoothing, final double maxQual) {
final double doubleMismatches = (double) (numMismatches + smoothing);
final double doubleObservations = (double) (numObservations + smoothing);
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
return Math.min(empiricalQual, maxQual);
}
public final byte empiricalQualByte(final int smoothing) {
final double doubleMismatches = (double) (numMismatches + smoothing);
final double doubleObservations = (double) (numObservations + smoothing);
return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40
}
public final byte empiricalQualByte() {
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
}
public final String outputToCSV() {
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
}
}

View File

@ -0,0 +1,102 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 27, 2009
*
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
public class RecalibrationArgumentCollection {
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
* CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base mismatches
*/
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
public int MISMATCHES_CONTEXT_SIZE = 2;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base insertions
*/
@Argument(fullName = "insertions_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions", required = false)
public int INSERTIONS_CONTEXT_SIZE = 8;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base deletions
*/
@Argument(fullName = "deletions_context_size", shortName = "dcs", doc = "size of the k-mer context to be used for base deletions", required = false)
public int DELETIONS_CONTEXT_SIZE = 8;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
*/
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
public byte MISMATCHES_DEFAULT_QUALITY = -1;
/**
* A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
*/
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
public byte INSERTIONS_DEFAULT_QUALITY = 45;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
*/
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
}