Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
b4a7c0d98d
|
|
@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
|
||||
import java.io.*;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
|
|
@ -141,6 +142,11 @@ public class GATKReport {
|
|||
tables.put(table.getTableName(), table);
|
||||
}
|
||||
|
||||
public void addTables(List<GATKReportTable> gatkReportTables) {
|
||||
for (GATKReportTable table : gatkReportTables)
|
||||
addTable(table);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if table with a given name exists
|
||||
*
|
||||
|
|
|
|||
|
|
@ -26,17 +26,13 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
|
|
@ -45,80 +41,31 @@ import java.util.Map;
|
|||
|
||||
|
||||
public class BQSRGatherer extends Gatherer {
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private static final String EOF_MARKER = "EOF";
|
||||
|
||||
private HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
|
||||
|
||||
|
||||
private void addCSVData (String line) {
|
||||
String[] covariates = line.split(",");
|
||||
String key = "";
|
||||
RecalDatumOptimized values;
|
||||
|
||||
for (int i = 0; i < covariates.length-3; i++)
|
||||
key += covariates[i] + ",";
|
||||
|
||||
if (covariates.length < 3)
|
||||
throw new ReviewedStingException("Line only has 1 covariate : " + line);
|
||||
|
||||
values = new RecalDatumOptimized(Long.parseLong(covariates[covariates.length - 3]), Long.parseLong(covariates[covariates.length - 2]));
|
||||
|
||||
RecalDatumOptimized currentValues = dataMap.get(key);
|
||||
if (currentValues == null)
|
||||
dataMap.put(key, values);
|
||||
else
|
||||
currentValues.increment(values);
|
||||
|
||||
}
|
||||
|
||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
|
||||
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
||||
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
PrintStream o;
|
||||
RecalibrationReport generalReport = null;
|
||||
PrintStream outputFile;
|
||||
try {
|
||||
o = new PrintStream(output);
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new UserException("File to be output by CountCovariates Gather function was not found");
|
||||
outputFile = new PrintStream(output);
|
||||
} catch(FileNotFoundException e) {
|
||||
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
|
||||
}
|
||||
|
||||
boolean sawEOF = false;
|
||||
boolean printedHeader = false;
|
||||
|
||||
// Read input files
|
||||
for ( File RECAL_FILE : inputs) {
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true; // sanity check
|
||||
break;
|
||||
}
|
||||
|
||||
else if(line.startsWith("#")) {
|
||||
if (!printedHeader)
|
||||
o.println(line);
|
||||
}
|
||||
|
||||
else // Found a line of data
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
printedHeader = true;
|
||||
for (File input : inputs) {
|
||||
RecalibrationReport inputReport = new RecalibrationReport(input);
|
||||
if (generalReport == null)
|
||||
generalReport = inputReport;
|
||||
else
|
||||
generalReport.combine(inputReport);
|
||||
}
|
||||
if (generalReport == null)
|
||||
throw new ReviewedStingException(EMPTY_INPUT_LIST);
|
||||
|
||||
// Write output file from dataMap
|
||||
for(Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet())
|
||||
o.println(entry.getKey() + entry.getValue().outputToCSV());
|
||||
o.println("EOF");
|
||||
generalReport.calculateEmpiricalAndQuantizedQualities();
|
||||
generalReport.output(outputFile);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,17 +25,17 @@ import java.util.*;
|
|||
* @since 3/6/12
|
||||
*/
|
||||
public class BQSRKeyManager {
|
||||
private List<RequiredCovariateInfo> requiredCovariates;
|
||||
private List<OptionalCovariateInfo> optionalCovariates;
|
||||
private Map<String, Short> covariateNameToIDMap;
|
||||
private final List<RequiredCovariateInfo> requiredCovariates;
|
||||
private final List<OptionalCovariateInfo> optionalCovariates;
|
||||
private final Map<String, Short> covariateNameToIDMap;
|
||||
|
||||
private int nRequiredBits; // Number of bits used to represent the required covariates
|
||||
private int nOptionalBits; // Number of bits used to represent the standard covaraites
|
||||
private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
|
||||
private int totalNumberOfBits; // Sum of all of the above plus the event bits
|
||||
private int nRequiredBits; // Number of bits used to represent the required covariates
|
||||
private int nOptionalBits; // Number of bits used to represent the standard covaraites
|
||||
private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
|
||||
private final int totalNumberOfBits; // Sum of all of the above plus the event bits
|
||||
|
||||
private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
|
||||
private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
|
||||
private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
|
||||
private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
|
||||
|
||||
/**
|
||||
* Initializes the KeyManager with the total number of covariates to use
|
||||
|
|
@ -44,34 +44,34 @@ public class BQSRKeyManager {
|
|||
* @param optionalCovariates the ordered list of optional covariates
|
||||
*/
|
||||
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
|
||||
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
|
||||
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
|
||||
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
|
||||
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
|
||||
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
|
||||
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
|
||||
|
||||
nRequiredBits = 0;
|
||||
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
|
||||
int nBits = required.numberOfBits(); // number of bits used by this covariate
|
||||
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
|
||||
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
|
||||
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
|
||||
int nBits = required.numberOfBits(); // number of bits used by this covariate
|
||||
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
|
||||
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
|
||||
nRequiredBits += nBits;
|
||||
}
|
||||
|
||||
short id = 0;
|
||||
nOptionalBits = 0;
|
||||
for (Covariate optional : optionalCovariates) {
|
||||
int nBits = optional.numberOfBits(); // number of bits used by this covariate
|
||||
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
|
||||
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
|
||||
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
|
||||
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
|
||||
int nBits = optional.numberOfBits(); // number of bits used by this covariate
|
||||
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
|
||||
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
|
||||
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
|
||||
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
|
||||
this.covariateNameToIDMap.put(covariateName, id);
|
||||
id++;
|
||||
}
|
||||
|
||||
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
|
||||
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
|
||||
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
|
||||
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
|
||||
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
|
||||
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
|
||||
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
|
||||
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -93,32 +93,32 @@ public class BQSRKeyManager {
|
|||
* @return one key in bitset representation per covariate
|
||||
*/
|
||||
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
|
||||
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||
|
||||
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
|
||||
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
|
||||
|
||||
int covariateIndex = 0;
|
||||
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
|
||||
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
|
||||
for (RequiredCovariateInfo infoRequired : requiredCovariates)
|
||||
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
|
||||
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
|
||||
|
||||
for (OptionalCovariateInfo infoOptional : optionalCovariates) {
|
||||
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
|
||||
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
|
||||
if (covariateKey == null)
|
||||
continue; // do not add nulls to the final set of keys.
|
||||
continue; // do not add nulls to the final set of keys.
|
||||
|
||||
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
|
||||
optionalKey.or(requiredKey); // import all the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(optionalKey); // add this key to the list of keys
|
||||
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
|
||||
optionalKey.or(requiredKey); // import all the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(optionalKey); // add this key to the list of keys
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
|
||||
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(requiredKey); // add this key to the list of keys
|
||||
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
|
||||
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(requiredKey); // add this key to the list of keys
|
||||
}
|
||||
|
||||
return allBitSets;
|
||||
|
|
@ -141,25 +141,25 @@ public class BQSRKeyManager {
|
|||
|
||||
int requiredCovariate = 0;
|
||||
for (RequiredCovariateInfo infoRequired : requiredCovariates) {
|
||||
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
|
||||
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() > 0) {
|
||||
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
|
||||
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
|
||||
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
|
||||
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
|
||||
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
|
||||
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
|
||||
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
|
||||
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
|
||||
|
||||
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
}
|
||||
|
||||
int eventIndex = key.length - 1; // the event type is always the last key
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
|
||||
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
|
||||
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
|
||||
int eventIndex = key.length - 1; // the event type is always the last key
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
|
||||
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
|
||||
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
|
||||
|
||||
return bitSetKey;
|
||||
}
|
||||
|
|
@ -186,19 +186,19 @@ public class BQSRKeyManager {
|
|||
public List<Object> keySetFrom(BitSet key) {
|
||||
List<Object> objectKeys = new ArrayList<Object>();
|
||||
for (RequiredCovariateInfo info : requiredCovariates) {
|
||||
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
|
||||
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
|
||||
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
|
||||
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() > 0) {
|
||||
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
|
||||
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is)
|
||||
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
|
||||
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
|
||||
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
|
||||
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
|
||||
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
|
||||
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is)
|
||||
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
|
||||
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
|
||||
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
|
||||
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
|
||||
}
|
||||
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
|
||||
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
|
||||
|
||||
return objectKeys;
|
||||
}
|
||||
|
|
@ -227,7 +227,7 @@ public class BQSRKeyManager {
|
|||
private BitSet chopNBitsFrom(BitSet key, int n) {
|
||||
BitSet choppedKey = new BitSet();
|
||||
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
|
||||
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
|
||||
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
|
||||
return choppedKey;
|
||||
}
|
||||
|
||||
|
|
@ -269,7 +269,7 @@ public class BQSRKeyManager {
|
|||
|
||||
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
|
||||
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
|
||||
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
|
||||
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
|
||||
}
|
||||
|
||||
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
|
||||
|
|
@ -282,22 +282,20 @@ public class BQSRKeyManager {
|
|||
* Aggregate information for each Covariate
|
||||
*/
|
||||
class RequiredCovariateInfo {
|
||||
public int bitsBefore; // number of bits before this covariate in the combined bitset key
|
||||
public int nBits; // number of bits used by this covariate (cached access to covariate.nBits())
|
||||
public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
|
||||
public Covariate covariate; // this allows reverse lookup of the Covariates in order
|
||||
public final int bitsBefore; // number of bits before this covariate in the combined bitset key
|
||||
public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
|
||||
public final Covariate covariate; // this allows reverse lookup of the Covariates in order
|
||||
|
||||
RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) {
|
||||
this.bitsBefore = bitsBefore;
|
||||
this.nBits = nBits;
|
||||
this.mask = mask;
|
||||
this.covariate = covariate;
|
||||
}
|
||||
}
|
||||
|
||||
class OptionalCovariateInfo {
|
||||
public BitSet covariateID; // cache the covariate ID
|
||||
public Covariate covariate;
|
||||
public final BitSet covariateID; // cache the covariate ID
|
||||
public final Covariate covariate;
|
||||
|
||||
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
|
||||
this.covariateID = covariateID;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Class that encapsulates the information necessary for quality score quantization for BQSR
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 3/26/12
|
||||
*/
|
||||
public class QuantizationInfo {
|
||||
private List<Byte> quantizedQuals;
|
||||
private List<Long> empiricalQualCounts;
|
||||
|
||||
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
|
||||
this.quantizedQuals = quantizedQuals;
|
||||
this.empiricalQualCounts = empiricalQualCounts;
|
||||
}
|
||||
|
||||
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
|
||||
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
|
||||
for (int i = 0; i < qualHistogram.length; i++)
|
||||
qualHistogram[i] = 0L;
|
||||
|
||||
Map<BitSet, RecalDatum> qualTable = null; // look for the quality score table
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = entry.getKey();
|
||||
if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covaraites
|
||||
qualTable = entry.getValue();
|
||||
}
|
||||
|
||||
if (qualTable == null)
|
||||
throw new ReviewedStingException("Could not find QualityScore table.");
|
||||
|
||||
for (RecalDatum datum : qualTable.values()) {
|
||||
int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
|
||||
long nObservations = datum.numObservations;
|
||||
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
|
||||
}
|
||||
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
|
||||
quantizeQualityScores(nLevels);
|
||||
}
|
||||
|
||||
|
||||
public void quantizeQualityScores(int nLevels) {
|
||||
QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels
|
||||
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
|
||||
}
|
||||
|
||||
public List<Byte> getQuantizedQuals() {
|
||||
return quantizedQuals;
|
||||
}
|
||||
|
||||
public GATKReportTable generateReportTable() {
|
||||
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
|
||||
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, 0L);
|
||||
quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, (byte) 0);
|
||||
|
||||
for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) {
|
||||
quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
|
||||
quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
|
||||
}
|
||||
return quantizedTable;
|
||||
}
|
||||
}
|
||||
|
|
@ -28,7 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -42,6 +43,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -71,13 +73,14 @@ public class RecalDataManager {
|
|||
public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
|
||||
public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
|
||||
public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
|
||||
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
|
||||
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
|
||||
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
* Treat reference inserted bases as reference matching bases. Very unsafe!
|
||||
|
|
@ -136,25 +139,38 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
public static void listAvailableCovariates(Logger logger) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
logger.info("Available covariates:");
|
||||
for (Class<?> covClass : covariateClasses)
|
||||
logger.info(covClass.getSimpleName());
|
||||
logger.info("");
|
||||
/**
|
||||
* Initializes the recalibration table -> key manager map
|
||||
*
|
||||
* @param requiredCovariates list of required covariates (in order)
|
||||
* @param optionalCovariates list of optional covariates (in order)
|
||||
* @return a map with each key manager and it's corresponding recalibration table properly initialized
|
||||
*/
|
||||
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
}
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
return tablesAndKeysMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates two lists : required covariates and optional covariates based on the user's requests.
|
||||
*
|
||||
*
|
||||
* Performs the following tasks in order:
|
||||
* 1. Adds all requierd covariates in order
|
||||
* 2. Check if the user asked to use the standard covariates and adds them all if that's the case
|
||||
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
|
||||
*
|
||||
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
|
||||
*
|
||||
* @param argumentCollection the argument collection object for the recalibration walker
|
||||
* @return a pair of ordered lists : required covariates (first) and optional covariates (second)
|
||||
*/
|
||||
|
|
@ -194,52 +210,102 @@ public class RecalDataManager {
|
|||
return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the recalibration table -> key manager map
|
||||
*
|
||||
* @param requiredCovariates list of required covariates (in order)
|
||||
* @param optionalCovariates list of optional covariates (in order)
|
||||
* @return a map with each key manager and it's corresponding recalibration table properly initialized
|
||||
*/
|
||||
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
}
|
||||
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
return tablesAndKeysMap;
|
||||
public static void listAvailableCovariates(Logger logger) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
logger.info("Available covariates:");
|
||||
for (Class<?> covClass : covariateClasses)
|
||||
logger.info(covClass.getSimpleName());
|
||||
logger.info("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the table -> key manager map (unfortunate copy of the above code with minor modifications to accomodate the different return types (RecalDatum vs EmpiricalQual objects)
|
||||
*
|
||||
* @param requiredCovariates list of required covariates (in order)
|
||||
* @param optionalCovariates list of optional covariates (in order)
|
||||
* @return a map with each key manager and it's corresponding recalibration table properly initialized
|
||||
*/
|
||||
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> initializeEmpiricalTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
public static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
|
||||
List<GATKReportTable> result = new LinkedList<GATKReportTable>();
|
||||
int tableIndex = 0;
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = entry.getKey();
|
||||
Map<BitSet, RecalDatum> recalTable = entry.getValue();
|
||||
|
||||
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");
|
||||
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
|
||||
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.2f");
|
||||
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.2f");
|
||||
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
|
||||
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
|
||||
|
||||
long primaryKey = 0L;
|
||||
|
||||
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
|
||||
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
|
||||
|
||||
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
|
||||
|
||||
for (Covariate covariate : requiredList) {
|
||||
String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
|
||||
columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
|
||||
}
|
||||
|
||||
if (optionalList.size() > 0) {
|
||||
columnNames.add(covariateValue);
|
||||
columnNames.add(covariateName);
|
||||
}
|
||||
|
||||
columnNames.add(eventType); // the order of these column names is important here
|
||||
columnNames.add(empiricalQuality);
|
||||
columnNames.add(estimatedQReported);
|
||||
columnNames.add(nObservations);
|
||||
columnNames.add(nErrors);
|
||||
|
||||
|
||||
reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
|
||||
for (Pair<String, String> columnName : columnNames)
|
||||
reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // every table must have the event type
|
||||
|
||||
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
|
||||
BitSet bitSetKey = recalTableEntry.getKey();
|
||||
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
|
||||
Iterator<Pair<String, String>> iterator = columnNames.iterator();
|
||||
for (Object key : keyManager.keySetFrom(bitSetKey)) {
|
||||
String columnName = iterator.next().getFirst();
|
||||
columnData.put(columnName, key);
|
||||
}
|
||||
RecalDatum datum = recalTableEntry.getValue();
|
||||
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); // iterator.next() gives the column name for Empirical Quality
|
||||
columnData.put(iterator.next().getFirst(), Math.round(datum.getEstimatedQReported())); // iterator.next() gives the column name for EstimatedQReported
|
||||
columnData.put(iterator.next().getFirst(), datum.numObservations);
|
||||
columnData.put(iterator.next().getFirst(), datum.numMismatches);
|
||||
|
||||
for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
|
||||
String columnName = dataEntry.getKey();
|
||||
Object value = dataEntry.getValue();
|
||||
reportTable.set(primaryKey, columnName, value.toString());
|
||||
}
|
||||
primaryKey++;
|
||||
}
|
||||
result.add(reportTable);
|
||||
}
|
||||
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
|
||||
return tablesAndKeysMap;
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
|
||||
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
|
||||
}
|
||||
|
||||
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
|
||||
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
|
||||
}
|
||||
|
||||
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
|
||||
GATKReport report = new GATKReport();
|
||||
report.addTable(argumentTable);
|
||||
report.addTable(quantizationTable);
|
||||
report.addTables(recalTables);
|
||||
report.print(outputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
|
|
|
|||
|
|
@ -38,6 +38,8 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
|
||||
|
||||
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
|
|
@ -75,7 +77,6 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
|
||||
this.increment(other.numObservations, other.numMismatches);
|
||||
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
|
||||
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -84,8 +85,8 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
|
||||
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
|
||||
public final void calcCombinedEmpiricalQuality(final int maxQual) {
|
||||
this.empiricalQuality = empiricalQualDouble(SMOOTHING_CONSTANT, maxQual); // cache the value so we don't call log over and over again
|
||||
}
|
||||
|
||||
public final void calcEstimatedReportedQuality() {
|
||||
|
|
@ -106,6 +107,11 @@ public class RecalDatum extends RecalDatumOptimized {
|
|||
return empiricalQuality;
|
||||
}
|
||||
|
||||
public final void resetCalculatedQualities() {
|
||||
empiricalQuality = 0.0;
|
||||
estimatedQReported = 0.0;
|
||||
}
|
||||
|
||||
private double calcExpectedErrors() {
|
||||
return (double) this.numObservations * qualToErrorProb(estimatedQReported);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collections;
|
||||
|
|
@ -156,5 +158,25 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
|
||||
public int QUANTIZING_LEVELS = 16;
|
||||
|
||||
public GATKReportTable generateReportTable() {
|
||||
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
|
||||
argumentsTable.addPrimaryKey("Argument");
|
||||
argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, "null");
|
||||
argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
|
||||
argumentsTable.set("standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, USE_STANDARD_COVARIATES);
|
||||
argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
|
||||
argumentsTable.set("solid_recal_mode", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
|
||||
argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
|
||||
argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
|
||||
argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE);
|
||||
argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE);
|
||||
argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
|
||||
argumentsTable.set("insertions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
|
||||
argumentsTable.set("low_quality_tail", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
|
||||
argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
|
||||
argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
|
||||
argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
|
||||
return argumentsTable;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,290 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class has all the static functionality for reading a recalibration report file into memory.
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 3/26/12
|
||||
*/
|
||||
public class RecalibrationReport {
|
||||
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
|
||||
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
|
||||
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
|
||||
|
||||
GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
|
||||
RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter | todo -- this should be a new parameter, not necessarily coming from the original table parameter list
|
||||
|
||||
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
|
||||
|
||||
public RecalibrationReport(final File RECAL_FILE) {
|
||||
GATKReport report = new GATKReport(RECAL_FILE);
|
||||
|
||||
argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
|
||||
RAC = initializeArgumentCollectionTable(argumentTable);
|
||||
|
||||
GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
|
||||
quantizationInfo = initializeQuantizationTable(quantizedTable);
|
||||
|
||||
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
|
||||
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
|
||||
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
|
||||
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
|
||||
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
|
||||
|
||||
for (Covariate cov : requestedCovariates)
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
|
||||
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
|
||||
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
|
||||
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
|
||||
table = parseReadGroupTable(keyManager, reportTable);
|
||||
}
|
||||
else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
|
||||
table = parseQualityScoreTable(keyManager, reportTable);
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
|
||||
|
||||
keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
|
||||
}
|
||||
|
||||
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
|
||||
final Map<BitSet, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
|
||||
keysAndTablesMap.put(keyManager, table);
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines two recalibration reports by adding all observations and errors
|
||||
*
|
||||
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate them
|
||||
* after combining. The reason for not calculating it is because this function is inteded for combining a series of
|
||||
* recalibration reports, and it only makes sense to calculate the empirical qualities and quantized qualities after all
|
||||
* the recalibration reports have been combined. Having the user recalculate when appropriate, makes this method faster
|
||||
*
|
||||
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
|
||||
*
|
||||
* @param other the recalibration report to combine with this one
|
||||
*/
|
||||
public void combine(RecalibrationReport other) {
|
||||
Iterator<Map<BitSet, RecalDatum>> tableIterator = keysAndTablesMap.values().iterator(); // because these are ordered (linked hashmaps) we can iterate over the 'this' and do a for loop on the 'other' tables and be sure that we are looking at the equivalent tables on both objects
|
||||
for (Map<BitSet, RecalDatum> otherTable : other.getKeysAndTablesMap().values()) { // iterate over all tables for 'other'
|
||||
Map<BitSet, RecalDatum> thisTable = tableIterator.next(); // iterate over all tables for 'this'
|
||||
for (Map.Entry<BitSet, RecalDatum> entry : otherTable.entrySet()) { // for each table, go through all the entries in the 'other' dataset to update 'this' dataset
|
||||
BitSet key = entry.getKey();
|
||||
RecalDatum otherDatum = entry.getValue();
|
||||
RecalDatum thisDatum = thisTable.get(key);
|
||||
thisDatum.increment(otherDatum); // add the two datum objects into 'this'
|
||||
thisDatum.resetCalculatedQualities(); // reset the empirical quality to make sure the user doesn't forget to recalculate it
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public QuantizationInfo getQuantizationInfo() {
|
||||
return quantizationInfo;
|
||||
}
|
||||
|
||||
public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() {
|
||||
return keysAndTablesMap;
|
||||
}
|
||||
|
||||
public ArrayList<Covariate> getRequestedCovariates() {
|
||||
return requestedCovariates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared parsing functionality for all tables.
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
|
||||
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);
|
||||
|
||||
for (Object primaryKey : reportTable.getPrimaryKeys()) {
|
||||
int nKeys = columnNamesOrderedList.size();
|
||||
Object [] keySet = new Object[nKeys];
|
||||
for (int i = 0; i < nKeys; i++)
|
||||
keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
|
||||
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
|
||||
BitSet bitKey = keyManager.bitSetFromKey(keySet);
|
||||
|
||||
long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
|
||||
long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
|
||||
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
|
||||
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
|
||||
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
|
||||
|
||||
result.put(bitKey, recalDatum);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
|
||||
*
|
||||
* @param table the GATKReportTable containing the quantization mappings
|
||||
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
|
||||
*/
|
||||
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
|
||||
Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
|
||||
Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object quantizedObject = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
|
||||
Object countObject = table.get(primaryKey, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
|
||||
byte originalQual = Byte.parseByte(primaryKey.toString());
|
||||
byte quantizedQual = Byte.parseByte(quantizedObject.toString());
|
||||
long quantizedCount = Long.parseLong(countObject.toString());
|
||||
quals[originalQual] = quantizedQual;
|
||||
counts[originalQual] = quantizedCount;
|
||||
}
|
||||
return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts));
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
|
||||
*
|
||||
* @param table the GATKReportTable containing the arguments and its corresponding values
|
||||
* @return a RAC object properly initialized with all the objects in the table
|
||||
*/
|
||||
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
if (value.equals("null"))
|
||||
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
|
||||
|
||||
if (primaryKey.equals("covariate") && value != null)
|
||||
RAC.COVARIATES = value.toString().split(",");
|
||||
|
||||
else if (primaryKey.equals("standard_covs"))
|
||||
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_recal_mode"))
|
||||
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_nocall_strategy"))
|
||||
RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_context_size"))
|
||||
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_context_size"))
|
||||
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_context_size"))
|
||||
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_default_quality"))
|
||||
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_default_quality"))
|
||||
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_default_quality"))
|
||||
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("low_quality_tail"))
|
||||
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("default_platform"))
|
||||
RAC.DEFAULT_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("force_platform"))
|
||||
RAC.FORCE_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("quantizing_levels"))
|
||||
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
|
||||
}
|
||||
|
||||
return RAC;
|
||||
}
|
||||
|
||||
/**
|
||||
* this functionality avoids recalculating the empirical qualities, estimated reported quality
|
||||
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
|
||||
*/
|
||||
public void calculateEmpiricalAndQuantizedQualities() {
|
||||
quantizationInfo.quantizeQualityScores(RAC.QUANTIZING_LEVELS);
|
||||
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) {
|
||||
for (RecalDatum datum : table.values()) {
|
||||
datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE);
|
||||
datum.calcEstimatedReportedQuality();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void output(PrintStream output) {
|
||||
RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output);
|
||||
}
|
||||
}
|
||||
|
|
@ -25,13 +25,9 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -46,224 +42,27 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
public class BaseRecalibration {
|
||||
private List<Byte> qualQuantizationMap; // histogram containing the map for qual quantization (calculated after recalibration is done)
|
||||
private LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> keysAndTablesMap; // quick access reference to the read group table and its key manager
|
||||
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
|
||||
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
|
||||
|
||||
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
|
||||
|
||||
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
|
||||
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
|
||||
|
||||
/**
|
||||
* Should ALWAYS use the constructor with the GATK Report file
|
||||
*/
|
||||
private BaseRecalibration() {}
|
||||
|
||||
/**
|
||||
* Constructor using a GATK Report file
|
||||
*
|
||||
* @param RECAL_FILE a GATK Report file containing the recalibration information
|
||||
*/
|
||||
public BaseRecalibration(final File RECAL_FILE) {
|
||||
GATKReport report = new GATKReport(RECAL_FILE);
|
||||
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
|
||||
|
||||
GATKReportTable argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
|
||||
RecalibrationArgumentCollection RAC = initializeArgumentCollectionTable(argumentTable);
|
||||
|
||||
GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
|
||||
qualQuantizationMap = initializeQuantizationTable(quantizedTable);
|
||||
|
||||
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
|
||||
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
|
||||
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
|
||||
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
|
||||
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
|
||||
|
||||
for (Covariate cov : requestedCovariates)
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
|
||||
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
|
||||
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
|
||||
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
|
||||
for (Covariate covariate : requiredCovariates) {
|
||||
requiredCovariatesToAdd.add(covariate);
|
||||
final Map<BitSet, EmpiricalQual> table; // initializing a new recal table for each required covariate (cumulatively)
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
|
||||
|
||||
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
|
||||
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
|
||||
table = parseReadGroupTable(keyManager, reportTable);
|
||||
}
|
||||
else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
|
||||
table = parseQualityScoreTable(keyManager, reportTable);
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
|
||||
|
||||
keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
|
||||
}
|
||||
|
||||
|
||||
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
|
||||
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
|
||||
final Map<BitSet, EmpiricalQual> table = parseAllCovariatesTable(keyManager, reportTable);
|
||||
keysAndTablesMap.put(keyManager, table); // adding the pair table+key to the map
|
||||
quantizationInfo = recalibrationReport.getQuantizationInfo();
|
||||
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
|
||||
requestedCovariates = recalibrationReport.getRequestedCovariates();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
|
||||
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
|
||||
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
|
||||
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
|
||||
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared parsing functionality for all tables.
|
||||
*
|
||||
* @param keyManager the key manager for this table
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
|
||||
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
|
||||
*/
|
||||
private Map<BitSet, EmpiricalQual> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
|
||||
Map<BitSet, EmpiricalQual> result = new HashMap<BitSet, EmpiricalQual>(reportTable.getNumRows()*2);
|
||||
|
||||
for (Object primaryKey : reportTable.getPrimaryKeys()) {
|
||||
int nKeys = columnNamesOrderedList.size();
|
||||
Object [] keySet = new Object[nKeys];
|
||||
for (int i = 0; i < nKeys; i++)
|
||||
keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
|
||||
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
|
||||
BitSet bitKey = keyManager.bitSetFromKey(keySet);
|
||||
|
||||
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
|
||||
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
|
||||
EmpiricalQual empiricalQual = new EmpiricalQual(estimatedQReported, empiricalQuality);
|
||||
|
||||
result.put(bitKey, empiricalQual);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
|
||||
*
|
||||
* @param table the GATKReportTable containing the quantization mappings
|
||||
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
|
||||
*/
|
||||
private List<Byte> initializeQuantizationTable(GATKReportTable table) {
|
||||
Byte[] result = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object value = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
|
||||
byte originalQual = Byte.parseByte(primaryKey.toString());
|
||||
byte quantizedQual = Byte.parseByte(value.toString());
|
||||
result[originalQual] = quantizedQual;
|
||||
}
|
||||
return Arrays.asList(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
|
||||
*
|
||||
* @param table the GATKReportTable containing the arguments and its corresponding values
|
||||
* @return a RAC object properly initialized with all the objects in the table
|
||||
*/
|
||||
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
for (Object primaryKey : table.getPrimaryKeys()) {
|
||||
Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
if (value.equals("null"))
|
||||
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
|
||||
|
||||
if (primaryKey.equals("covariate") && value != null)
|
||||
RAC.COVARIATES = value.toString().split(",");
|
||||
|
||||
else if (primaryKey.equals("standard_covs"))
|
||||
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_recal_mode"))
|
||||
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("solid_nocall_strategy"))
|
||||
RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_context_size"))
|
||||
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_context_size"))
|
||||
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_context_size"))
|
||||
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
|
||||
|
||||
else if (primaryKey.equals("mismatches_default_quality"))
|
||||
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("insertions_default_quality"))
|
||||
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("deletions_default_quality"))
|
||||
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("low_quality_tail"))
|
||||
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
|
||||
|
||||
else if (primaryKey.equals("default_platform"))
|
||||
RAC.DEFAULT_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("force_platform"))
|
||||
RAC.FORCE_PLATFORM = (String) value;
|
||||
|
||||
else if (primaryKey.equals("quantizing_levels"))
|
||||
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
|
||||
}
|
||||
|
||||
return RAC;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recalibrates the base qualities of a read
|
||||
*
|
||||
|
|
@ -316,9 +115,9 @@ public class BaseRecalibration {
|
|||
double deltaQReported = 0.0;
|
||||
double deltaQCovariates = 0.0;
|
||||
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, EmpiricalQual>> mapEntry : keysAndTablesMap.entrySet()) {
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> mapEntry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = mapEntry.getKey();
|
||||
Map<BitSet, EmpiricalQual> table = mapEntry.getValue();
|
||||
Map<BitSet, RecalDatum> table = mapEntry.getValue();
|
||||
|
||||
switch(keyManager.getRequiredCovariates().size()) {
|
||||
case 1: // this is the ReadGroup table
|
||||
|
|
@ -326,7 +125,7 @@ public class BaseRecalibration {
|
|||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
|
||||
|
||||
final EmpiricalQual empiricalQualRG = table.get(bitKeys.get(0));
|
||||
final RecalDatum empiricalQualRG = table.get(bitKeys.get(0));
|
||||
if (empiricalQualRG != null) {
|
||||
final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality();
|
||||
final double aggregrateQReported = empiricalQualRG.getEstimatedQReported();
|
||||
|
|
@ -339,7 +138,7 @@ public class BaseRecalibration {
|
|||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
|
||||
|
||||
final EmpiricalQual empiricalQualQS = table.get(bitKeys.get(0));
|
||||
final RecalDatum empiricalQualQS = table.get(bitKeys.get(0));
|
||||
if (empiricalQualQS != null) {
|
||||
final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
|
|
@ -348,7 +147,7 @@ public class BaseRecalibration {
|
|||
else { // this is the table with all the covariates
|
||||
bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn
|
||||
for (BitSet k : bitKeys) {
|
||||
final EmpiricalQual empiricalQualCO = table.get(k);
|
||||
final RecalDatum empiricalQualCO = table.get(k);
|
||||
if (empiricalQualCO != null) {
|
||||
double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality();
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
|
|
@ -364,7 +163,8 @@ public class BaseRecalibration {
|
|||
double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula
|
||||
recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL
|
||||
|
||||
return qualQuantizationMap.get((int) recalibratedQual); // return the quantized version of the recalibrated quality
|
||||
|
||||
return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,476 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A general algorithm for quantizing quality score distributions to use a specific number of levels
|
||||
*
|
||||
* Takes a histogram of quality scores and a desired number of levels and produces a
|
||||
* map from original quality scores -> quantized quality scores.
|
||||
*
|
||||
* Note that this data structure is fairly heavy-weight, holding lots of debugging and
|
||||
* calculation information. If you want to use it efficiently at scale with lots of
|
||||
* read groups the right way to do this:
|
||||
*
|
||||
* Map<ReadGroup, List<Byte>> map
|
||||
* for each read group rg:
|
||||
* hist = getQualHist(rg)
|
||||
* QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual)
|
||||
* map.set(rg, qq.getOriginalToQuantizedMap())
|
||||
*
|
||||
* This map would then be used to look up the appropriate original -> quantized
|
||||
* quals for each read as it comes in.
|
||||
*
|
||||
* @author Mark Depristo
|
||||
* @since 3/2/12
|
||||
*/
|
||||
public class QualQuantizer {
|
||||
final private static Set<QualInterval> MY_EMPTY_SET = Collections.emptySet();
|
||||
|
||||
private static Logger logger = Logger.getLogger(QualQuantizer.class);
|
||||
|
||||
/**
|
||||
* Inputs to the QualQuantizer
|
||||
*/
|
||||
final int nLevels, minInterestingQual;
|
||||
final List<Long> nObservationsPerQual;
|
||||
|
||||
/**
|
||||
* Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28).
|
||||
*
|
||||
* Has the same range as nObservationsPerQual
|
||||
*/
|
||||
final List<Byte> originalToQuantizedMap;
|
||||
|
||||
/** Sorted set of qual intervals.
|
||||
*
|
||||
* After quantize() this data structure contains only the top-level qual intervals
|
||||
*/
|
||||
final TreeSet<QualInterval> quantizedIntervals;
|
||||
|
||||
/**
|
||||
* Protected creator for testng use only
|
||||
*/
|
||||
protected QualQuantizer(final int minInterestingQual) {
|
||||
this.nObservationsPerQual = Collections.emptyList();
|
||||
this.nLevels = 0;
|
||||
this.minInterestingQual = minInterestingQual;
|
||||
this.quantizedIntervals = null;
|
||||
this.originalToQuantizedMap = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a QualQuantizer for the histogram that has nLevels
|
||||
*
|
||||
* Note this is the only interface to the system. After creating this object
|
||||
* the map can be obtained via getOriginalToQuantizedMap()
|
||||
*
|
||||
* @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that
|
||||
* this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the
|
||||
* way up to the largest quality score possible in the reads. OK if the histogram includes many 0
|
||||
* count bins, as these are quantized for free.
|
||||
* @param nLevels the desired number of distinct quality scores to represent the full original range. Must
|
||||
* be at least 1.
|
||||
* @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely
|
||||
* merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and
|
||||
* quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing
|
||||
* all data with Q0-Q10.
|
||||
*/
|
||||
public QualQuantizer(final List<Long> nObservationsPerQual, final int nLevels, final int minInterestingQual) {
|
||||
this.nObservationsPerQual = nObservationsPerQual;
|
||||
this.nLevels = nLevels;
|
||||
this.minInterestingQual = minInterestingQual;
|
||||
|
||||
// some sanity checking
|
||||
if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedStingException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual));
|
||||
if ( nLevels < 0 ) throw new ReviewedStingException("nLevels must be >= 0");
|
||||
if ( minInterestingQual < 0 ) throw new ReviewedStingException("minInterestingQual must be >= 0");
|
||||
|
||||
// actually run the quantizer
|
||||
this.quantizedIntervals = quantize();
|
||||
|
||||
// store the map
|
||||
this.originalToQuantizedMap = intervalsToMap(quantizedIntervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents an contiguous interval of quality scores.
|
||||
*
|
||||
* qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2
|
||||
*/
|
||||
@Invariant({
|
||||
"qStart <= qEnd",
|
||||
"qStart >= 0",
|
||||
"qEnd <= 1000",
|
||||
"nObservations >= 0",
|
||||
"nErrors >= 0",
|
||||
"nErrors <= nObservations",
|
||||
"fixedQual >= -1 && fixedQual <= QualityUtils.MAX_QUAL_SCORE",
|
||||
"mergeOrder >= 0"})
|
||||
protected final class QualInterval implements Comparable<QualInterval> {
|
||||
final int qStart, qEnd, fixedQual, level;
|
||||
final long nObservations, nErrors;
|
||||
final Set<QualInterval> subIntervals;
|
||||
|
||||
/** for debugging / visualization. When was this interval created? */
|
||||
int mergeOrder;
|
||||
|
||||
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) {
|
||||
this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET);
|
||||
}
|
||||
|
||||
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set<QualInterval> subIntervals) {
|
||||
this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals);
|
||||
}
|
||||
|
||||
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) {
|
||||
this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET);
|
||||
}
|
||||
|
||||
@Requires("level >= 0")
|
||||
public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set<QualInterval> subIntervals) {
|
||||
this.qStart = qStart;
|
||||
this.qEnd = qEnd;
|
||||
this.nObservations = nObservations;
|
||||
this.nErrors = nErrors;
|
||||
this.fixedQual = fixedQual;
|
||||
this.level = level;
|
||||
this.mergeOrder = 0;
|
||||
this.subIntervals = Collections.unmodifiableSet(subIntervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Human readable name of this interval: e.g., 10-12
|
||||
* @return
|
||||
*/
|
||||
public String getName() {
|
||||
return qStart + "-" + qEnd;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "QQ:" + getName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the error rate (in real space) of this interval, or 0 if there are no obserations
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0.0")
|
||||
public double getErrorRate() {
|
||||
if ( hasFixedQual() )
|
||||
return QualityUtils.qualToErrorProb((byte)fixedQual);
|
||||
else if ( nObservations == 0 )
|
||||
return 0.0;
|
||||
else
|
||||
return (nErrors+1) / (1.0 * (nObservations+1));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the QUAL of the error rate of this interval, or the fixed
|
||||
* qual if this interval was created with a fixed qual.
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
|
||||
public byte getQual() {
|
||||
if ( ! hasFixedQual() )
|
||||
return QualityUtils.probToQual(1-getErrorRate(), 0);
|
||||
else
|
||||
return (byte)fixedQual;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if this bin is using a fixed qual
|
||||
*/
|
||||
public boolean hasFixedQual() {
|
||||
return fixedQual != -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final QualInterval qualInterval) {
|
||||
return new Integer(this.qStart).compareTo(qualInterval.qStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a interval representing the merge of this interval and toMerge
|
||||
*
|
||||
* Errors and observations are combined
|
||||
* Subintervals updated in order of left to right (determined by qStart)
|
||||
* Level is 1 + highest level of this and toMerge
|
||||
* Order must be updated elsewhere
|
||||
*
|
||||
* @param toMerge
|
||||
* @return newly created merged QualInterval
|
||||
*/
|
||||
@Requires({"toMerge != null"})
|
||||
@Ensures({
|
||||
"result != null",
|
||||
"result.nObservations >= this.nObservations",
|
||||
"result.nObservations >= toMerge.nObservations",
|
||||
"result.nErrors >= this.nErrors",
|
||||
"result.nErrors >= toMerge.nErrors",
|
||||
"result.qStart == Math.min(this.qStart, toMerge.qStart)",
|
||||
"result.qEnd == Math.max(this.qEnd, toMerge.qEnd)",
|
||||
"result.level > Math.max(this.level, toMerge.level)",
|
||||
"result.subIntervals.size() == 2"
|
||||
})
|
||||
public QualInterval merge(final QualInterval toMerge) {
|
||||
final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge;
|
||||
final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;
|
||||
|
||||
if ( left.qEnd + 1 != right.qStart )
|
||||
throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right);
|
||||
|
||||
final long nCombinedObs = left.nObservations + right.nObservations;
|
||||
final long nCombinedErr = left.nErrors + right.nErrors;
|
||||
|
||||
final int level = Math.max(left.level, right.level) + 1;
|
||||
final Set<QualInterval> subIntervals = new HashSet<QualInterval>(Arrays.asList(left, right));
|
||||
QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals);
|
||||
|
||||
return merged;
|
||||
}
|
||||
|
||||
/**
 * The penalty of this interval, measured against its own overall error rate.
 * @return a non-negative penalty value (see calcPenalty for the definition)
 */
public double getPenalty() {
    return calcPenalty(getErrorRate());
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the penalty of this interval, given the overall error rate for the interval
|
||||
*
|
||||
* If the globalErrorRate is e, this value is:
|
||||
*
|
||||
* sum_i |log10(e_i) - log10(e)| * nObservations_i
|
||||
*
|
||||
* each the index i applies to all leaves of the tree accessible from this interval
|
||||
* (found recursively from subIntervals as necessary)
|
||||
*
|
||||
* @param globalErrorRate overall error rate in real space against which we calculate the penalty
|
||||
* @return the cost of approximating the bins in this interval with the globalErrorRate
|
||||
*/
|
||||
@Requires("globalErrorRate >= 0.0")
|
||||
@Ensures("result >= 0.0")
|
||||
private double calcPenalty(final double globalErrorRate) {
|
||||
if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
|
||||
return 0.0;
|
||||
|
||||
if ( subIntervals.isEmpty() ) {
|
||||
// this is leave node
|
||||
if ( this.qEnd <= minInterestingQual )
|
||||
// It's free to merge up quality scores below the smallest interesting one
|
||||
return 0;
|
||||
else {
|
||||
return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations;
|
||||
}
|
||||
} else {
|
||||
double sum = 0;
|
||||
for ( final QualInterval interval : subIntervals )
|
||||
sum += interval.calcPenalty(globalErrorRate);
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main method for computing the quantization intervals.
|
||||
*
|
||||
* Invoked in the constructor after all input variables are initialized. Walks
|
||||
* over the inputs and builds the min. penalty forest of intervals with exactly nLevel
|
||||
* root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed
|
||||
* to find the optimal combination.
|
||||
*
|
||||
* TODO: develop a smarter algorithm
|
||||
*
|
||||
* @return the forest of intervals with size == nLevels
|
||||
*/
|
||||
@Ensures({"! result.isEmpty()", "result.size() == nLevels"})
|
||||
private TreeSet<QualInterval> quantize() {
|
||||
// create intervals for each qual individually
|
||||
final TreeSet<QualInterval> intervals = new TreeSet<QualInterval>();
|
||||
for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) {
|
||||
final long nObs = nObservationsPerQual.get(qStart);
|
||||
final double errorRate = QualityUtils.qualToErrorProb((byte)qStart);
|
||||
final double nErrors = nObs * errorRate;
|
||||
final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart);
|
||||
intervals.add(qi);
|
||||
}
|
||||
|
||||
// greedy algorithm:
|
||||
// while ( n intervals >= nLevels ):
|
||||
// find intervals to merge with least penalty
|
||||
// merge it
|
||||
while ( intervals.size() > nLevels ) {
|
||||
mergeLowestPenaltyIntervals(intervals);
|
||||
}
|
||||
|
||||
return intervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that finds and mergest together the lowest penalty pair
|
||||
* of intervals
|
||||
* @param intervals
|
||||
*/
|
||||
@Requires("! intervals.isEmpty()")
|
||||
private void mergeLowestPenaltyIntervals(final TreeSet<QualInterval> intervals) {
|
||||
// setup the iterators
|
||||
final Iterator<QualInterval> it1 = intervals.iterator();
|
||||
final Iterator<QualInterval> it1p = intervals.iterator();
|
||||
it1p.next(); // skip one
|
||||
|
||||
// walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty
|
||||
QualInterval minMerge = null;
|
||||
if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size());
|
||||
int lastMergeOrder = 0;
|
||||
while ( it1p.hasNext() ) {
|
||||
final QualInterval left = it1.next();
|
||||
final QualInterval right = it1p.next();
|
||||
final QualInterval merged = left.merge(right);
|
||||
lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder);
|
||||
if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) {
|
||||
if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge);
|
||||
minMerge = merged;
|
||||
}
|
||||
}
|
||||
|
||||
// now actually go ahead and merge the minMerge pair
|
||||
if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge);
|
||||
intervals.removeAll(minMerge.subIntervals);
|
||||
intervals.add(minMerge);
|
||||
minMerge.mergeOrder = lastMergeOrder + 1;
|
||||
if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a final forest of intervals constructs a list mapping
|
||||
* list.get(i) => quantized qual to use for original quality score i
|
||||
*
|
||||
* This function should be called only once to initialize the corresponding
|
||||
* cached value in this object, as the calculation is a bit costly.
|
||||
*
|
||||
* @param intervals
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result.size() == getNQualsInHistogram()")
|
||||
private List<Byte> intervalsToMap(final TreeSet<QualInterval> intervals) {
|
||||
final List<Byte> map = new ArrayList<Byte>(getNQualsInHistogram());
|
||||
map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE));
|
||||
for ( final QualInterval interval : intervals ) {
|
||||
for ( int q = interval.qStart; q <= interval.qEnd; q++ ) {
|
||||
map.set(q, interval.getQual());
|
||||
}
|
||||
}
|
||||
|
||||
if ( Collections.min(map) == Byte.MIN_VALUE )
|
||||
throw new ReviewedStingException("quantized quality score map contains an un-initialized value");
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
 * @return the number of distinct quality scores in the input histogram
 *         (the size of nObservationsPerQual)
 */
@Ensures("result > 0")
private final int getNQualsInHistogram() {
    return nObservationsPerQual.size();
}
|
||||
|
||||
/**
 * Write out a GATKReport to visualize the QualQuantization process of this data
 * @param out destination stream for the rendered report
 */
public void writeReport(PrintStream out) {
    final GATKReport report = new GATKReport();

    // one table for the input histogram, one for the resulting intervals
    addQualHistogramToReport(report);
    addIntervalsToReport(report);

    report.print(out);
}
|
||||
|
||||
private final void addQualHistogramToReport(final GATKReport report) {
|
||||
report.addTable("QualHistogram", "Quality score histogram provided to report");
|
||||
GATKReportTable table = report.getTable("QualHistogram");
|
||||
|
||||
table.addPrimaryKey("qual");
|
||||
table.addColumn("count", "NA");
|
||||
|
||||
for ( int q = 0; q < nObservationsPerQual.size(); q++ ) {
|
||||
table.set(q, "count", nObservationsPerQual.get(q));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private final void addIntervalsToReport(final GATKReport report) {
|
||||
report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals");
|
||||
GATKReportTable table = report.getTable("QualQuantizerIntervals");
|
||||
|
||||
table.addPrimaryKey("name");
|
||||
table.addColumn("qStart", "NA");
|
||||
table.addColumn("qEnd", "NA");
|
||||
table.addColumn("level", "NA");
|
||||
table.addColumn("merge.order", "NA");
|
||||
table.addColumn("nErrors", "NA");
|
||||
table.addColumn("nObservations", "NA");
|
||||
table.addColumn("qual", "NA");
|
||||
table.addColumn("penalty", "NA");
|
||||
table.addColumn("root.node", "NA");
|
||||
//table.addColumn("subintervals", "NA");
|
||||
|
||||
for ( QualInterval interval : quantizedIntervals)
|
||||
addIntervalToReport(table, interval, true);
|
||||
}
|
||||
|
||||
private final void addIntervalToReport(final GATKReportTable table, QualInterval interval, final boolean atRootP) {
|
||||
final String name = interval.getName();
|
||||
table.set(name, "qStart", interval.qStart);
|
||||
table.set(name, "qEnd", interval.qEnd);
|
||||
table.set(name, "level", interval.level);
|
||||
table.set(name, "merge.order", interval.mergeOrder);
|
||||
table.set(name, "nErrors", interval.nErrors);
|
||||
table.set(name, "nObservations", interval.nObservations);
|
||||
table.set(name, "qual", interval.getQual());
|
||||
table.set(name, "penalty", String.format("%.1f", interval.getPenalty()));
|
||||
table.set(name, "root.node", atRootP);
|
||||
|
||||
for ( final QualInterval sub : interval.subIntervals )
|
||||
addIntervalToReport(table, sub, false);
|
||||
}
|
||||
|
||||
/**
 * @return the cached map whose index is an original quality score and whose
 *         value is the quantized quality score to use in its place
 */
public List<Byte> getOriginalToQuantizedMap() {
    return originalToQuantizedMap;
}
|
||||
}
|
||||
|
|
@ -1,5 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -13,17 +16,69 @@ import java.util.List;
|
|||
public class BQSRGathererUnitTest {
|
||||
RecalibrationArgumentCollection RAC;
|
||||
|
||||
private static File recal1 = new File("public/testdata/exampleCSV.csv");
|
||||
private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
|
||||
private static File recal = new File("public/testdata/exampleGRP.grp");
|
||||
|
||||
//todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure
|
||||
@Test(enabled = false)
|
||||
public void testCombineTwoFiles() {
|
||||
public void testCombineSimilarFiles() {
|
||||
BQSRGatherer gatherer = new BQSRGatherer();
|
||||
List<File> recalFiles = new LinkedList<File> ();
|
||||
File output = new File("foo.csv");
|
||||
|
||||
recalFiles.add(recal1);
|
||||
recalFiles.add(recal2);
|
||||
File output = new File("foo.grp");
|
||||
recalFiles.add(recal);
|
||||
recalFiles.add(recal);
|
||||
gatherer.gather(recalFiles, output);
|
||||
|
||||
GATKReport originalReport = new GATKReport(recal);
|
||||
GATKReport calculatedReport = new GATKReport(output);
|
||||
for (GATKReportTable originalTable : originalReport.getTables()) {
|
||||
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
|
||||
List<String> columnsToTest = new LinkedList<String>();
|
||||
if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
|
||||
columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
|
||||
}
|
||||
|
||||
else if (originalTable.getTableName().equals(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE)) {
|
||||
columnsToTest.add(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
|
||||
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
|
||||
}
|
||||
|
||||
else if (originalTable.getTableName().startsWith("RecalTable")) {
|
||||
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
|
||||
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
|
||||
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Common testing functionality given the columns to test and the multiplication factor to the expected result
|
||||
*
|
||||
* @param original the original table
|
||||
* @param calculated the calculated table
|
||||
* @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
|
||||
* @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated
|
||||
*/
|
||||
private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest, int factor) {
|
||||
for (Object primaryKey : original.getPrimaryKeys()) { // tables don't necessarily have the same primary keys
|
||||
for (String column : columnsToTest) {
|
||||
Object actual = calculated.get(primaryKey, column);
|
||||
Object expected = original.get(primaryKey, column);
|
||||
|
||||
if (factor != 1) {
|
||||
if (expected instanceof Double)
|
||||
expected = (Double) expected * factor;
|
||||
else if (expected instanceof Long)
|
||||
expected = (Long) expected * factor;
|
||||
else if (expected instanceof Integer)
|
||||
expected = (Integer) expected * factor;
|
||||
else if (expected instanceof Byte) {
|
||||
expected = (Byte) expected * factor;
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(actual, expected, "Primary key: " + primaryKey + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,132 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Unit tests for BQSRKeyManager: verifies that covariate values packed into
 * BitSet keys can be round-tripped back to the original values for several
 * combinations of required and optional covariates.
 *
 * @author Mauricio Carneiro
 * @since 3/7/12
 */
public class BQSRKeyManagerUnitTest {
    // shared argument collection passed to every covariate's initialize()
    RecalibrationArgumentCollection RAC;

    @BeforeClass
    public void init() {
        RAC = new RecalibrationArgumentCollection();
    }

    // two required covariates (read group + quality score) plus two optional ones
    @Test(enabled = true)
    public void testCombineBitSets() {
        final int nRequired = 2;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>();
        covariates.add(new ReadGroupCovariate());
        covariates.add(new QualityScoreCovariate());
        covariates.add(new CycleCovariate());
        covariates.add(new ContextCovariate());
        createReadAndTest(covariates, nRequired);
    }

    // only the two required covariates, no optional ones
    @Test(enabled = true)
    public void testOnlyRequiredCovariates() {
        final int nRequired = 2;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
        covariates.add(new ReadGroupCovariate());
        covariates.add(new QualityScoreCovariate());
        createReadAndTest(covariates, nRequired);
    }

    // degenerate case: a single required covariate
    @Test(enabled = true)
    public void testOnlyOneCovariate() {
        final int nRequired = 1;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
        covariates.add(new ReadGroupCovariate());
        createReadAndTest(covariates, nRequired);
    }

    /**
     * Builds a synthetic read and runs the key round-trip test under several
     * strand/pairing flag combinations (these flags affect covariate values
     * such as cycle).
     */
    private void createReadAndTest(List<Covariate> covariates, int nRequired) {
        int readLength = 1000;
        GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M");
        read.setReadGroup(new GATKSAMReadGroupRecord("ID"));
        read.getReadGroup().setPlatform("illumina");

        runTestOnRead(read, covariates, nRequired);
        read.setReadNegativeStrandFlag(true);
        runTestOnRead(read, covariates, nRequired);
        read.setReadPairedFlag(true);
        read.setSecondOfPairFlag(true);
        runTestOnRead(read, covariates, nRequired);
        read.setReadNegativeStrandFlag(false);
        runTestOnRead(read, covariates, nRequired);
    }

    /**
     * Core check: for every base of the read and every event type, builds the
     * per-covariate BitSet keys, combines them through a BQSRKeyManager, and
     * asserts that decoding each combined key reproduces the original
     * covariate values (plus the optional-covariate name and the event type).
     */
    private void runTestOnRead(GATKSAMRecord read, List<Covariate> covariateList, int nRequired) {
        // covariateKeys[covariate][event type][base position] = packed value
        final BitSet[][][] covariateKeys = new BitSet[covariateList.size()][EventType.values().length][];
        int i = 0;
        for (Covariate cov : covariateList) {
            cov.initialize(RAC);
            CovariateValues covValues = cov.getValues(read);
            covariateKeys[i][EventType.BASE_SUBSTITUTION.index] = covValues.getMismatches();
            covariateKeys[i][EventType.BASE_INSERTION.index] = covValues.getInsertions();
            covariateKeys[i][EventType.BASE_DELETION.index] = covValues.getDeletions();
            i++;
        }
        // the first nRequired covariates are "required"; the rest are "optional"
        List<Covariate> requiredCovariates = new LinkedList<Covariate>();
        List<Covariate> optionalCovariates = new LinkedList<Covariate>();

        for (int j=0; j<nRequired; j++)
            requiredCovariates.add(covariateList.get(j));
        for (int j=nRequired; j<covariateList.size(); j++)
            optionalCovariates.add(covariateList.get(j));

        BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);

        for (int l = 0; l < read.getReadLength(); l++) {
            for (int eventType = 0; eventType < EventType.values().length; eventType++) {
                BitSet[] keySet = new BitSet[covariateList.size()];
                Object[] expectedRequired = new Object[covariateList.size()];
                Object[] expectedCovariate = new Object[covariateList.size()];

                for (int j = 0; j < covariateList.size(); j++) {
                    keySet[j] = covariateKeys[j][eventType][l];

                    // decode each covariate's BitSet back to the value we expect to recover
                    if (j < nRequired)
                        expectedRequired[j] = covariateList.get(j).keyFromBitSet(keySet[j]);
                    else
                        expectedCovariate[j - nRequired] = covariateList.get(j).keyFromBitSet(keySet[j]);
                }

                // one combined key per optional covariate (or a single key when none)
                List<BitSet> hashKeys = keyManager.bitSetsFromAllKeys(keySet, EventType.eventFrom(eventType));
                short cov = 0;
                for (BitSet key : hashKeys) {
                    Object[] actual = keyManager.keySetFrom(key).toArray();

                    // Build the expected array: required values, then (when optional
                    // covariates exist) the optional value and its covariate name,
                    // and finally the event type
                    Object[] expected = new Object[nRequired + (optionalCovariates.size() > 0 ? 3 : 1)];
                    System.arraycopy(expectedRequired, 0, expected, 0, nRequired);
                    if (optionalCovariates.size() > 0) {
                        expected[expected.length-3] = expectedCovariate[cov];
                        // covariate name is the class's simple name minus the "Covariate" suffix
                        expected[expected.length-2] = optionalCovariates.get(cov++).getClass().getSimpleName().split("Covariate")[0];
                    }
                    expected[expected.length-1] = EventType.eventFrom(eventType);

                    for (int k = 0; k < expected.length; k++)
                        Assert.assertEquals(actual[k], expected[k]);
                }
            }
        }
    }
}
|
||||
|
|
@ -33,19 +33,15 @@ import net.sf.picard.reference.ReferenceSequenceFile;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue