BQSR with GATKReport implementation
* restructured BQSR to report recalibrated tables. * implemented empirical quality calculation to the BQSR stage (instead of on-the-fly recalibration) * linked quality score quantization to the BQSR stage, outputting a quantization histogram * included the arguments used in BQSR to the GATK Report * included all three tables (RG, QUAL and COVARIATES) to the GATK Report with empirical qualities On-the-fly recalibration with GATK Report * loads all tables from the GATKReport using existing infrastructure (with minor updates) * implemented initialiazation of the covariates using BQSR's argument list * reduced memory usage significantly by loading only the empirical quality and estimated quality reported for each bit set key * applied quality quantization to the base recalibration * excluded low quality bases from on-the-fly recalibration for mismatches, insertions or deletions
This commit is contained in:
parent
f421062b55
commit
9f74969e3a
|
|
@ -431,8 +431,7 @@ public class GATKReportTable {
|
|||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
if (column.getDataType().equals(GATKReportDataType.Byte) &&
|
||||
((String) value).length() == 1) {
|
||||
if (column.getDataType().equals(GATKReportDataType.Byte) && ((String) value).length() == 1) {
|
||||
newValue = ((String) value).charAt(0);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,10 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR
|
||||
|
|
@ -30,6 +27,7 @@ import java.util.List;
|
|||
public class BQSRKeyManager {
|
||||
private List<RequiredCovariateInfo> requiredCovariates;
|
||||
private List<OptionalCovariateInfo> optionalCovariates;
|
||||
private Map<String, Short> covariateNameToIDMap;
|
||||
|
||||
private int nRequiredBits; // Number of bits used to represent the required covariates
|
||||
private int nOptionalBits; // Number of bits used to represent the standard covaraites
|
||||
|
|
@ -48,6 +46,7 @@ public class BQSRKeyManager {
|
|||
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
|
||||
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
|
||||
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
|
||||
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
|
||||
|
||||
nRequiredBits = 0;
|
||||
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
|
||||
|
|
@ -57,14 +56,16 @@ public class BQSRKeyManager {
|
|||
nRequiredBits += nBits;
|
||||
}
|
||||
|
||||
short i = 0;
|
||||
short id = 0;
|
||||
nOptionalBits = 0;
|
||||
for (Covariate optional : optionalCovariates) {
|
||||
int nBits = optional.numberOfBits(); // number of bits used by this covariate
|
||||
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
|
||||
BitSet optionalID = BitSetUtils.bitSetFrom(i); // calculate the optional covariate ID for this covariate
|
||||
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
|
||||
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
|
||||
i++;
|
||||
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
|
||||
this.covariateNameToIDMap.put(covariateName, id);
|
||||
id++;
|
||||
}
|
||||
|
||||
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
|
||||
|
|
@ -92,7 +93,7 @@ public class BQSRKeyManager {
|
|||
* @return one key in bitset representation per covariate
|
||||
*/
|
||||
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
|
||||
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||
|
||||
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
|
||||
|
|
@ -147,7 +148,7 @@ public class BQSRKeyManager {
|
|||
if (optionalCovariates.size() > 0) {
|
||||
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
|
||||
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
|
||||
int covariateID = (Short) key[covariateIDIndex]; // get the optional covariate id
|
||||
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
|
||||
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
|
||||
|
||||
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
|
||||
|
|
@ -162,7 +163,17 @@ public class BQSRKeyManager {
|
|||
|
||||
return bitSetKey;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Covariate id can be either the covariate name (String) or the actual id (short). This method
|
||||
* finds it's type and converts accordingly to the short notation.
|
||||
*
|
||||
* @param id the string or short representation of the optional covariate id
|
||||
* @return the short representation of the optional covariate id.
|
||||
*/
|
||||
private short parseCovariateID(Object id) {
|
||||
return (id instanceof String) ? covariateNameToIDMap.get(id.toString()) : (Short) id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a key set of objects from a combined bitset key.
|
||||
|
|
@ -185,13 +196,27 @@ public class BQSRKeyManager {
|
|||
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
|
||||
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
|
||||
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
|
||||
objectKeys.add(id); // add the covariate id
|
||||
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
|
||||
}
|
||||
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
|
||||
|
||||
return objectKeys;
|
||||
}
|
||||
|
||||
public List<Covariate> getRequiredCovariates() {
|
||||
ArrayList<Covariate> list = new ArrayList<Covariate>(requiredCovariates.size());
|
||||
for (RequiredCovariateInfo info : requiredCovariates)
|
||||
list.add(info.covariate);
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<Covariate> getOptionalCovariates() {
|
||||
ArrayList<Covariate> list = new ArrayList<Covariate>(optionalCovariates.size());
|
||||
for (OptionalCovariateInfo info : optionalCovariates)
|
||||
list.add(info.covariate);
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a masked bitset into a bitset starting at 0
|
||||
*
|
||||
|
|
@ -253,7 +278,6 @@ public class BQSRKeyManager {
|
|||
return chopNBitsFrom(bitSet, leadingBits);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Aggregate information for each Covariate
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -188,7 +188,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return BitSetUtils.bitSetFrom((Short) key);
|
||||
return (key instanceof String) ? BitSetUtils.bitSetFrom(Short.parseShort((String) key)) : BitSetUtils.bitSetFrom((Short) key);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -79,8 +79,8 @@ public class QualityScoreCovariate implements RequiredCovariate {
|
|||
}
|
||||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return BitSetUtils.bitSetFrom((Byte) key);
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return (key instanceof String) ? BitSetUtils.bitSetFrom(Byte.parseByte((String) key)) : BitSetUtils.bitSetFrom((Byte) key);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -26,10 +26,15 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
|
@ -37,10 +42,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -53,18 +55,29 @@ import java.util.Map;
|
|||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
public final NestedHashMap nestedHashMap; // The full dataset
|
||||
private final HashMap<EventType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private final HashMap<EventType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private final HashMap<EventType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments";
|
||||
public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized";
|
||||
public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0";
|
||||
public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
|
||||
public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";
|
||||
|
||||
public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value";
|
||||
public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
|
||||
public final static String READGROUP_COLUMN_NAME = "ReadGroup";
|
||||
public final static String EVENT_TYPE_COLUMN_NAME = "EventType";
|
||||
public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality";
|
||||
public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported";
|
||||
public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
|
||||
public final static String COVARIATE_VALUE_SCORE_COLUMN_NAME = "CovariateValue";
|
||||
public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
|
||||
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
|
|
@ -82,7 +95,20 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
|
||||
*/
|
||||
REMOVE_REF_BIAS
|
||||
REMOVE_REF_BIAS;
|
||||
|
||||
public static SOLID_RECAL_MODE recalModeFromString(String recalMode) {
|
||||
if (recalMode.equals("DO_NOTHING"))
|
||||
return SOLID_RECAL_MODE.DO_NOTHING;
|
||||
if (recalMode.equals("SET_Q_ZERO"))
|
||||
return SOLID_RECAL_MODE.SET_Q_ZERO;
|
||||
if (recalMode.equals("SET_Q_ZERO_BASE_N"))
|
||||
return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N;
|
||||
if (recalMode.equals("REMOVE_REF_BIAS"))
|
||||
return SOLID_RECAL_MODE.REMOVE_REF_BIAS;
|
||||
|
||||
throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value");
|
||||
}
|
||||
}
|
||||
|
||||
public enum SOLID_NOCALL_STRATEGY {
|
||||
|
|
@ -97,78 +123,125 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
|
||||
*/
|
||||
PURGE_READ
|
||||
}
|
||||
PURGE_READ;
|
||||
|
||||
public RecalDataManager() {
|
||||
nestedHashMap = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) {
|
||||
if (nocallStrategy.equals("THROW_EXCEPTION"))
|
||||
return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
|
||||
if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED"))
|
||||
return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED;
|
||||
if (nocallStrategy.equals("PURGE_READ"))
|
||||
return SOLID_NOCALL_STRATEGY.PURGE_READ;
|
||||
|
||||
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
||||
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
|
||||
nestedHashMap = null;
|
||||
dataCollapsedReadGroup = new HashMap<EventType, NestedHashMap>();
|
||||
dataCollapsedQualityScore = new HashMap<EventType, NestedHashMap>();
|
||||
dataCollapsedByCovariate = new HashMap<EventType, ArrayList<NestedHashMap>>();
|
||||
for (final EventType errorModel : EventType.values()) {
|
||||
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
|
||||
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap());
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
nestedHashMap = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value");
|
||||
}
|
||||
}
|
||||
|
||||
public static ReadCovariates covariateKeySetFrom(GATKSAMRecord read) {
|
||||
return (ReadCovariates) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
}
|
||||
public static void listAvailableCovariates(Logger logger) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
|
||||
private void checkForSingletons(final Map data) {
|
||||
// todo -- this looks like it's better just as a data.valueSet() call?
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
if (data.keySet().size() == 1) {
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
|
||||
// in a previous step of the sequential calculation model
|
||||
}
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
checkForSingletons((Map) val);
|
||||
}
|
||||
}
|
||||
// Print and exit if that's what was requested
|
||||
logger.info("Available covariates:");
|
||||
for (Class<?> covClass : covariateClasses)
|
||||
logger.info(covClass.getSimpleName());
|
||||
logger.info("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
*
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
* Generates two lists : required covariates and optional covariates based on the user's requests.
|
||||
*
|
||||
* Performs the following tasks in order:
|
||||
* 1. Adds all requierd covariates in order
|
||||
* 2. Check if the user asked to use the standard covariates and adds them all if that's the case
|
||||
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
|
||||
*
|
||||
* @param argumentCollection the argument collection object for the recalibration walker
|
||||
* @return a pair of ordered lists : required covariates (first) and optional covariates (second)
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable(final int covariate, final EventType errorModel) {
|
||||
if (covariate == 0) {
|
||||
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
|
||||
}
|
||||
else if (covariate == 1) {
|
||||
return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed
|
||||
}
|
||||
else {
|
||||
return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
public static Pair<ArrayList<Covariate>, ArrayList<Covariate>> initializeCovariates(RecalibrationArgumentCollection argumentCollection) {
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
|
||||
|
||||
ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
|
||||
ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>();
|
||||
if (argumentCollection.USE_STANDARD_COVARIATES)
|
||||
optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
|
||||
|
||||
if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified
|
||||
for (String requestedCovariateString : argumentCollection.COVARIATES) {
|
||||
boolean foundClass = false;
|
||||
for (Class<? extends Covariate> covClass : covariateClasses) {
|
||||
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
if (!requiredClasses.contains(covClass) &&
|
||||
(!argumentCollection.USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
|
||||
try {
|
||||
final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it
|
||||
optionalCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundClass) {
|
||||
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
|
||||
}
|
||||
}
|
||||
}
|
||||
return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the recalibration table -> key manager map
|
||||
*
|
||||
* @param requiredCovariates list of required covariates (in order)
|
||||
* @param optionalCovariates list of optional covariates (in order)
|
||||
* @return a map with each key manager and it's corresponding recalibration table properly initialized
|
||||
*/
|
||||
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
}
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
return tablesAndKeysMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the table -> key manager map (unfortunate copy of the above code with minor modifications to accomodate the different return types (RecalDatum vs EmpiricalQual objects)
|
||||
*
|
||||
* @param requiredCovariates list of required covariates (in order)
|
||||
* @param optionalCovariates list of optional covariates (in order)
|
||||
* @return a map with each key manager and it's corresponding recalibration table properly initialized
|
||||
*/
|
||||
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> initializeEmpiricalTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
}
|
||||
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
return tablesAndKeysMap;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
*
|
||||
|
|
@ -526,8 +599,9 @@ public class RecalDataManager {
|
|||
*
|
||||
* @param read The read for which to compute covariate values.
|
||||
* @param requestedCovariates The list of requested covariates.
|
||||
* @return a matrix with all the covariates calculated for every base in the read
|
||||
*/
|
||||
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
|
||||
public static ReadCovariates computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
|
||||
final int numRequestedCovariates = requestedCovariates.size();
|
||||
final int readLength = read.getReadLength();
|
||||
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
|
||||
|
|
@ -536,7 +610,7 @@ public class RecalDataManager {
|
|||
for (Covariate covariate : requestedCovariates)
|
||||
readCovariates.addCovariate(covariate.getValues(read));
|
||||
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, readCovariates);
|
||||
return readCovariates;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -613,4 +687,42 @@ public class RecalDataManager {
|
|||
return base;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds the required covariates to a covariate list
|
||||
*
|
||||
* Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand.
|
||||
*
|
||||
* @param classes list of classes to add to the covariate list
|
||||
* @return the covariate list
|
||||
*/
|
||||
private static ArrayList<Covariate> addRequiredCovariatesToList(List<Class<? extends RequiredCovariate>> classes) {
|
||||
ArrayList<Covariate> dest = new ArrayList<Covariate>(classes.size());
|
||||
if (classes.size() != 2)
|
||||
throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected");
|
||||
|
||||
dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next.
|
||||
dest.add(new QualityScoreCovariate());
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the standard covariates to a covariate list
|
||||
*
|
||||
* @param classes list of classes to add to the covariate list
|
||||
* @return the covariate list
|
||||
*/
|
||||
private static ArrayList<Covariate> addStandardCovariatesToList(List<Class<? extends StandardCovariate>> classes) {
|
||||
ArrayList<Covariate> dest = new ArrayList<Covariate>(classes.size());
|
||||
for (Class<?> covClass : classes) {
|
||||
try {
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
dest.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,6 +87,10 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
|
||||
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
|
||||
}
|
||||
|
||||
public final void calcEstimatedReportedQuality() {
|
||||
this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / (double) numObservations);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ public class RecalibrationArgumentCollection {
|
|||
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
|
||||
*/
|
||||
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
|
||||
protected List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
|
|
@ -60,25 +60,25 @@ public class RecalibrationArgumentCollection {
|
|||
*/
|
||||
@Gather(BQSRGatherer.class)
|
||||
@Output
|
||||
protected PrintStream RECAL_FILE;
|
||||
public PrintStream RECAL_FILE;
|
||||
|
||||
/**
|
||||
* List all implemented covariates.
|
||||
*/
|
||||
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
|
||||
protected boolean LIST_ONLY = false;
|
||||
public boolean LIST_ONLY = false;
|
||||
|
||||
/**
|
||||
* Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
|
||||
*/
|
||||
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
|
||||
protected String[] COVARIATES = null;
|
||||
public String[] COVARIATES = null;
|
||||
|
||||
/*
|
||||
* Use the standard set of covariates in addition to the ones listed using the -cov argument
|
||||
*/
|
||||
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
|
||||
protected boolean USE_STANDARD_COVARIATES = true;
|
||||
public boolean USE_STANDARD_COVARIATES = true;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
|
|
@ -88,7 +88,7 @@ public class RecalibrationArgumentCollection {
|
|||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
protected boolean RUN_WITHOUT_DBSNP = false;
|
||||
public boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
|
|
@ -152,6 +152,9 @@ public class RecalibrationArgumentCollection {
|
|||
@Hidden
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
|
||||
public int QUANTIZING_LEVELS = 16;
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: Mar 22, 2012
|
||||
*
|
||||
* Object that holds the empirical quality and estimated reported quality values for on-the-fly recalibration. This is a simplification of the RecalDatum object
|
||||
*/
|
||||
|
||||
public class EmpiricalQual {
|
||||
|
||||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
|
||||
|
||||
private EmpiricalQual() {}
|
||||
|
||||
public EmpiricalQual(final double estimatedQReported, final double empiricalQuality) {
|
||||
this.estimatedQReported = estimatedQReported;
|
||||
this.empiricalQuality = empiricalQuality;
|
||||
}
|
||||
|
||||
public final double getEstimatedQReported() {
|
||||
return estimatedQReported;
|
||||
}
|
||||
|
||||
public final double getEmpiricalQuality() {
|
||||
return empiricalQuality;
|
||||
}
|
||||
}
|
||||
|
|
@ -25,248 +25,267 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
* User: rpoplin
|
||||
* User: carneiro and rpoplin
|
||||
* Date: 2/4/12
|
||||
*/
|
||||
|
||||
public class BaseRecalibration {
|
||||
private List<Byte> qualQuantizationMap; // histogram containing the map for qual quantization (calculated after recalibration is done)
|
||||
private LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> keysAndTablesMap; // quick access reference to the read group table and its key manager
|
||||
|
||||
private ArrayList<HashMap<BitSet, RecalDatum>> collapsedHashes = new ArrayList<HashMap<BitSet, RecalDatum>> (); // All the collapsed data tables
|
||||
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
|
||||
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of all covariates to be used in this calculation
|
||||
private final ArrayList<Covariate> requiredCovariates = new ArrayList<Covariate>(); // List of required covariates to be used in this calculation
|
||||
private final ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>(); // List of optional covariates to be used in this calculation
|
||||
|
||||
public static final Pattern REQUIRED_COVARIATE_PATTERN = Pattern.compile("^# Required Covariates.*");
|
||||
public static final Pattern OPTIONAL_COVARIATE_PATTERN = Pattern.compile("^# Optional Covariates.*");
|
||||
public static final String EOF_MARKER = "EOF";
|
||||
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
|
||||
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
|
||||
|
||||
private static final byte SMOOTHING_CONSTANT = 1;
|
||||
|
||||
ArrayList<BQSRKeyManager> keyManagers = new ArrayList<BQSRKeyManager>();
|
||||
/**
|
||||
* Should ALWAYS use the constructor with the GATK Report file
|
||||
*/
|
||||
private BaseRecalibration() {}
|
||||
|
||||
/**
|
||||
* Constructor using a GATK Report file
|
||||
*
|
||||
* @param RECAL_FILE a GATK Report file containing the recalibration information
|
||||
*/
|
||||
public BaseRecalibration(final File RECAL_FILE) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // todo -- initialize with the parameters from the csv file!
|
||||
GATKReport report = new GATKReport(RECAL_FILE);
|
||||
|
||||
int lineNumber = 0;
|
||||
GATKReportTable argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
|
||||
RecalibrationArgumentCollection RAC = initializeArgumentCollectionTable(argumentTable);
|
||||
|
||||
boolean foundRequiredCovariates = false;
|
||||
boolean foundOptionalCovariates = false;
|
||||
boolean initializedKeyManagers = false;
|
||||
GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
|
||||
qualQuantizationMap = initializeQuantizationTable(quantizedTable);
|
||||
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
boolean sawEOF = false;
|
||||
try {
|
||||
for (String line : new XReadLines(RECAL_FILE)) {
|
||||
lineNumber++;
|
||||
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
|
||||
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
|
||||
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
|
||||
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
|
||||
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
|
||||
|
||||
sawEOF = EOF_MARKER.equals(line);
|
||||
if (sawEOF)
|
||||
break;
|
||||
|
||||
boolean requiredCovariatesLine = REQUIRED_COVARIATE_PATTERN.matcher(line).matches();
|
||||
boolean optionalCovariatesLine = OPTIONAL_COVARIATE_PATTERN.matcher(line).matches();
|
||||
|
||||
if (requiredCovariatesLine && foundRequiredCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate required covariates line");
|
||||
|
||||
if (optionalCovariatesLine && foundOptionalCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate optional covariates line");
|
||||
|
||||
if (optionalCovariatesLine && !foundRequiredCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Optional covariates reported before Required covariates");
|
||||
|
||||
if (requiredCovariatesLine || optionalCovariatesLine) {
|
||||
String [] covariateNames = line.split(": ")[1].split(","); // take the second half of the string (past the ":") and split it by "," to get the list of required covariates
|
||||
|
||||
List<Covariate> covariateList = requiredCovariatesLine ? requiredCovariates : optionalCovariates; // set the appropriate covariate list to update
|
||||
|
||||
for (String covariateName : covariateNames) {
|
||||
boolean foundClass = false;
|
||||
for (Class<?> covClass : classes) {
|
||||
if ((covariateName + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate) covClass.newInstance();
|
||||
covariate.initialize(RAC);
|
||||
requestedCovariates.add(covariate);
|
||||
covariateList.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!foundClass)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (covariateName + "Covariate") + ") isn't a valid covariate option.");
|
||||
}
|
||||
foundRequiredCovariates = foundRequiredCovariates || requiredCovariatesLine;
|
||||
foundOptionalCovariates = foundOptionalCovariates || optionalCovariatesLine;
|
||||
}
|
||||
|
||||
else if (!line.startsWith("#")) { // if this is not a comment line that we don't care about, it is DATA!
|
||||
if (!foundRequiredCovariates || !foundOptionalCovariates) // At this point all the covariates should have been found and initialized
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||
|
||||
if (!initializedKeyManagers) {
|
||||
ArrayList<Covariate> emptyList = new ArrayList<Covariate>(0);
|
||||
ArrayList<Covariate> requiredCovariatesUpToThis = new ArrayList<Covariate>(); // Initialize one key manager for each table of required covariate
|
||||
for (Covariate covariate : requiredCovariates) { // Every required covariate table includes all preceding required covariates (e.g. RG ; RG,Q )
|
||||
requiredCovariatesUpToThis.add(covariate);
|
||||
keyManagers.add(new BQSRKeyManager(requiredCovariatesUpToThis, emptyList));
|
||||
}
|
||||
keyManagers.add(new BQSRKeyManager(requiredCovariates, optionalCovariates)); // One master key manager for the collapsed tables
|
||||
|
||||
initializedKeyManagers = true;
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
|
||||
if (!sawEOF) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
|
||||
generateEmpiricalQualities(SMOOTHING_CONSTANT);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
*
|
||||
* @param file The CSV file we read the line from (for exception throwing purposes)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
boolean hasOptionalCovariates = optionalCovariates.size() > 0; // Do we have optional covariates in this key?
|
||||
int addOptionalCovariates = hasOptionalCovariates ? 2 : 0; // If we have optional covariates at all, add two to the size of the array (to acommodate the covariate and the id)
|
||||
final Object[] key = new Object[requiredCovariates.size() + addOptionalCovariates + 1]; // Reserve enough space for the required covariates, optional covariate, id and eventType
|
||||
|
||||
int indexCovariateValue = key.length - 3; // In the order of keys, the optional covariate comes right after the required covariates
|
||||
int indexCovariateID = key.length - 2; // followed by the covariate ID
|
||||
int indexEventType = key.length - 1; // and the event type
|
||||
|
||||
addKeysToArray(key, vals, requiredCovariates, 0); // Add the required covariates keys
|
||||
|
||||
if (hasOptionalCovariates) {
|
||||
key[indexCovariateID] = Short.parseShort(vals[indexCovariateID]); // Add the optional covariate ID
|
||||
Covariate covariate = optionalCovariates.get((Short) key[indexCovariateID]); // Get the covariate object for this ID
|
||||
key[indexCovariateValue] = covariate.getValue(vals[indexCovariateValue]); // Add the optional covariate value, given the ID
|
||||
}
|
||||
key[indexEventType] = EventType.eventFrom(vals[indexEventType]); // Add the event type
|
||||
|
||||
int datumIndex = key.length; // The recal datum starts at the end of the key (after the event type)
|
||||
long count = Long.parseLong(vals[datumIndex]); // Number of observations
|
||||
long errors = Long.parseLong(vals[datumIndex + 1]); // Number of errors observed
|
||||
double reportedQual = Double.parseDouble(vals[1]); // The reported Q score --> todo -- I don't like having the Q score hard coded in vals[1]. Generalize it!
|
||||
final RecalDatum datum = new RecalDatum(count, errors, reportedQual, 0.0); // Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
|
||||
addToAllTables(key, datum); // Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
*
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
*/
|
||||
private void addToAllTables(final Object[] key, final RecalDatum fullDatum) {
|
||||
int nHashes = requiredCovariates.size(); // We will always need one hash per required covariate
|
||||
if (optionalCovariates.size() > 0) // If we do have optional covariates
|
||||
nHashes += 1; // we will need one extra hash table with the optional covariate encoded in the key set on top of the required covariates
|
||||
for (Covariate cov : requestedCovariates)
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
|
||||
|
||||
for (int hashIndex = 0; hashIndex < nHashes; hashIndex++) {
|
||||
HashMap<BitSet, RecalDatum> table; // object to hold the hash table we are going to manipulate
|
||||
if (hashIndex >= collapsedHashes.size()) { // if we haven't yet created the collapsed hash table for this index, create it now!
|
||||
table = new HashMap<BitSet, RecalDatum>();
|
||||
collapsedHashes.add(table); // Because this is the only place where we add tables to the ArrayList, they will always be in the order we want.
|
||||
}
|
||||
else
|
||||
table = collapsedHashes.get(hashIndex); // if the table has been previously created, just assign it to the "table" object for manipulation
|
||||
|
||||
int copyTo = hashIndex + 1; // this will copy the covariates up to the index of the one we are including now (1 for RG, 2 for QS,...)
|
||||
if (copyTo > requiredCovariates.size()) // only in the case where we have optional covariates we need to increase the size of the array
|
||||
copyTo = requiredCovariates.size() + 2; // if we have optional covarites, add the optional covariate and it's id to the size of the key
|
||||
Object[] tableKey = new Object[copyTo + 1]; // create a new array that will hold as many keys as hashIndex (1 for RG hash, 2 for QualityScore hash, 3 for covariate hash plus the event type
|
||||
System.arraycopy(key, 0, tableKey, 0, copyTo); // copy the keys for the corresponding covariates into the tableKey.
|
||||
tableKey[tableKey.length-1] = key[key.length - 1]; // add the event type. The event type is always the last key, on both key sets.
|
||||
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, EmpiricalQual> table; // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
|
||||
BitSet hashKey = keyManagers.get(hashIndex).bitSetFromKey(tableKey); // Add bitset key with fullDatum to the appropriate hash
|
||||
RecalDatum datum = table.get(hashKey);
|
||||
if (datum == null)
|
||||
datum = fullDatum;
|
||||
else if (hashIndex == 0) // Special case for the ReadGroup covariate
|
||||
datum.combine(fullDatum);
|
||||
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
|
||||
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
|
||||
table = parseReadGroupTable(keyManager, reportTable);
|
||||
}
|
||||
else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
|
||||
table = parseQualityScoreTable(keyManager, reportTable);
|
||||
}
|
||||
else
|
||||
datum.increment(fullDatum);
|
||||
table.put(hashKey, datum);
|
||||
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
|
||||
|
||||
keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
|
||||
}
|
||||
|
||||
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
|
||||
final Map<BitSet, EmpiricalQual> table = parseAllCovariatesTable(keyManager, reportTable);
|
||||
keysAndTablesMap.put(keyManager, table); // adding the pair table+key to the map
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private void generateEmpiricalQualities(final int smoothing) {
|
||||
for (final HashMap<BitSet, RecalDatum> table : collapsedHashes)
|
||||
for (final RecalDatum datum : table.values())
|
||||
datum.calcCombinedEmpiricalQuality(smoothing, QualityUtils.MAX_QUAL_SCORE);
|
||||
private Map<BitSet, EmpiricalQual> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared parsing functionality for all tables.
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
|
||||
Map<BitSet, EmpiricalQual> result = new HashMap<BitSet, EmpiricalQual>(reportTable.getNumRows()*2);
|
||||
|
||||
public void recalibrateRead(final GATKSAMRecord read) {
|
||||
//compute all covariate values for this read
|
||||
RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||
final ReadCovariates readCovariates = RecalDataManager.covariateKeySetFrom(read);
|
||||
for (Object primaryKey : reportTable.getPrimaryKeys()) {
|
||||
int nKeys = columnNamesOrderedList.size();
|
||||
Object [] keySet = new Object[nKeys];
|
||||
for (int i = 0; i < nKeys; i++)
|
||||
keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
|
||||
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
|
||||
BitSet bitKey = keyManager.bitSetFromKey(keySet);
|
||||
|
||||
for (final EventType errorModel : EventType.values()) {
|
||||
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
|
||||
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
|
||||
EmpiricalQual empiricalQual = new EmpiricalQual(estimatedQReported, empiricalQuality);
|
||||
|
||||
result.put(bitKey, empiricalQual);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
|
||||
*
|
||||
* @param table the GATKReportTable containing the quantization mappings
|
||||
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
|
||||
*/
|
||||
private List<Byte> initializeQuantizationTable(GATKReportTable table) {
|
||||
Byte[] result = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object value = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
|
||||
byte originalQual = Byte.parseByte(primaryKey.toString());
|
||||
byte quantizedQual = Byte.parseByte(value.toString());
|
||||
result[originalQual] = quantizedQual;
|
||||
}
|
||||
return Arrays.asList(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
|
||||
*
|
||||
* @param table the GATKReportTable containing the arguments and its corresponding values
|
||||
* @return a RAC object properly initialized with all the objects in the table
|
||||
*/
|
||||
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
if (value.equals("null"))
|
||||
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
|
||||
|
||||
if (primaryKey.equals("covariate") && value != null)
|
||||
RAC.COVARIATES = value.toString().split(",");
|
||||
|
||||
else if (primaryKey.equals("standard_covs"))
|
||||
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_recal_mode"))
|
||||
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_nocall_strategy"))
|
||||
RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_context_size"))
|
||||
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_context_size"))
|
||||
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_context_size"))
|
||||
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_default_quality"))
|
||||
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_default_quality"))
|
||||
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_default_quality"))
|
||||
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("low_quality_tail"))
|
||||
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("default_platform"))
|
||||
RAC.DEFAULT_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("force_platform"))
|
||||
RAC.FORCE_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("quantizing_levels"))
|
||||
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
|
||||
}
|
||||
|
||||
return RAC;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recalibrates the base qualities of a read
|
||||
*
|
||||
* It updates the base qualities of the read with the new recalibrated qualities (for all event types)
|
||||
*
|
||||
* @param read the read to recalibrate
|
||||
*/
|
||||
public void recalibrateRead(final GATKSAMRecord read) {
|
||||
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
|
||||
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
|
||||
final byte[] originalQuals = read.getBaseQualities(errorModel);
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// For each base in the read
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel);
|
||||
final byte qualityScore = performSequentialQualityCalculation(keySet, errorModel);
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
|
||||
byte qualityScore = originalQuals[offset];
|
||||
|
||||
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
|
||||
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
|
||||
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
|
||||
}
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||
read.setBaseQualities(recalQuals, errorModel);
|
||||
}
|
||||
}
|
||||
|
|
@ -286,86 +305,66 @@ public class BaseRecalibration {
|
|||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
*
|
||||
* todo -- I extremely dislike the way all this math is hardcoded... should rethink the data structures for this method in particular.
|
||||
*
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @param errorModel the event type
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
|
||||
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
|
||||
|
||||
final int readGroupKeyIndex = 0;
|
||||
final int qualKeyIndex = 1;
|
||||
final int covariatesKeyIndex = 2;
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
List<BitSet> bitKeys = keyManagers.get(readGroupKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException("There should only be one key for the RG collapsed table, something went wrong here");
|
||||
|
||||
final RecalDatum globalRecalDatum = collapsedHashes.get(readGroupKeyIndex).get(bitKeys.get(0));
|
||||
|
||||
double globalDeltaQ = 0.0;
|
||||
if (globalRecalDatum != null) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
bitKeys = keyManagers.get(qualKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException("There should only be one key for the Qual collapsed table, something went wrong here");
|
||||
|
||||
final RecalDatum qReportedRecalDatum = collapsedHashes.get(qualKeyIndex).get(bitKeys.get(0));
|
||||
double deltaQReported = 0.0;
|
||||
if (qReportedRecalDatum != null) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
bitKeys = keyManagers.get(covariatesKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
for (BitSet k : bitKeys) {
|
||||
final RecalDatum covariateRecalDatum = collapsedHashes.get(covariatesKeyIndex).get(k);
|
||||
if (covariateRecalDatum != null) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, EmpiricalQual>> mapEntry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = mapEntry.getKey();
|
||||
Map<BitSet, EmpiricalQual> table = mapEntry.getValue();
|
||||
|
||||
switch(keyManager.getRequiredCovariates().size()) {
|
||||
case 1: // this is the ReadGroup table
|
||||
List<BitSet> bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to the read group
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
|
||||
|
||||
final EmpiricalQual empiricalQualRG = table.get(bitKeys.get(0));
|
||||
if (empiricalQualRG != null) {
|
||||
final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality();
|
||||
final double aggregrateQReported = empiricalQualRG.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
if (keyManager.getOptionalCovariates().isEmpty()) { // this is the QualityScore table
|
||||
bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to the reported quality score
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
|
||||
|
||||
final EmpiricalQual empiricalQualQS = table.get(bitKeys.get(0));
|
||||
if (empiricalQualQS != null) {
|
||||
final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
}
|
||||
else { // this is the table with all the covariates
|
||||
bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn
|
||||
for (BitSet k : bitKeys) {
|
||||
final EmpiricalQual empiricalQualCO = table.get(k);
|
||||
if (empiricalQualCO != null) {
|
||||
double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality();
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE);
|
||||
double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula
|
||||
recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL
|
||||
|
||||
return qualQuantizationMap.get((int) recalibratedQual); // return the quantized version of the recalibrated quality
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
*
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||
if (originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared functionality to add keys
|
||||
*
|
||||
* @param array the target array we are creating the keys in
|
||||
* @param keys the actual keys we're using as a source
|
||||
* @param covariateList the covariate list to loop through
|
||||
* @param keyIndex the index in the keys and the arrays objects to run from
|
||||
*/
|
||||
private void addKeysToArray(final Object[] array, final String[] keys, List<Covariate> covariateList, int keyIndex) {
|
||||
for (Covariate covariate : covariateList) {
|
||||
array[keyIndex] = covariate.getValue(keys[keyIndex]);
|
||||
keyIndex++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,8 +52,8 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end
|
||||
|
||||
// Base Quality Score Recalibrator specific attribute tags
|
||||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD";
|
||||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions
|
||||
|
||||
// the SAMRecord data we're caching
|
||||
private String mReadString = null;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -12,10 +17,13 @@ import java.io.File;
|
|||
*/
|
||||
public class BaseRecalibrationUnitTest {
|
||||
|
||||
@Test(enabled=true)
|
||||
public void testReadingCSV() {
|
||||
File csv = new File("public/testdata/exampleCSV.csv");
|
||||
@Test(enabled=false)
|
||||
public void testReadingReport() {
|
||||
File csv = new File("public/testdata/exampleGATKREPORT.grp");
|
||||
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
|
||||
baseRecalibration.recalibrateRead(read);
|
||||
System.out.println("Success");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue