Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-03-27 15:01:03 -04:00
commit b4a7c0d98d
14 changed files with 2813 additions and 427 deletions

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
import java.util.Collection;
import java.util.List;
import java.util.TreeMap;
/**
@ -141,6 +142,11 @@ public class GATKReport {
tables.put(table.getTableName(), table);
}
/**
 * Registers every table in the given list with this report.
 *
 * @param gatkReportTables the tables to add, in order
 */
public void addTables(List<GATKReportTable> gatkReportTables) {
    for (final GATKReportTable reportTable : gatkReportTables) {
        addTable(reportTable);
    }
}
/**
* Return true if table with a given name exists
*

View File

@ -26,17 +26,13 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* User: carneiro
@ -45,80 +41,31 @@ import java.util.Map;
public class BQSRGatherer extends Gatherer {
/////////////////////////////
// Private Member Variables
/////////////////////////////
private static final String EOF_MARKER = "EOF";
private HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
private void addCSVData (String line) {
String[] covariates = line.split(",");
String key = "";
RecalDatumOptimized values;
for (int i = 0; i < covariates.length-3; i++)
key += covariates[i] + ",";
if (covariates.length < 3)
throw new ReviewedStingException("Line only has 1 covariate : " + line);
values = new RecalDatumOptimized(Long.parseLong(covariates[covariates.length - 3]), Long.parseLong(covariates[covariates.length - 2]));
RecalDatumOptimized currentValues = dataMap.get(key);
if (currentValues == null)
dataMap.put(key, values);
else
currentValues.increment(values);
}
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
private static final String MISSING_OUTPUT_FILE = "missing output file name";
@Override
public void gather(List<File> inputs, File output) {
PrintStream o;
RecalibrationReport generalReport = null;
PrintStream outputFile;
try {
o = new PrintStream(output);
} catch ( FileNotFoundException e) {
throw new UserException("File to be output by CountCovariates Gather function was not found");
outputFile = new PrintStream(output);
} catch(FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
}
boolean sawEOF = false;
boolean printedHeader = false;
// Read input files
for ( File RECAL_FILE : inputs) {
try {
for ( String line : new XReadLines(RECAL_FILE) ) {
if ( EOF_MARKER.equals(line) ) {
sawEOF = true; // sanity check
break;
}
else if(line.startsWith("#")) {
if (!printedHeader)
o.println(line);
}
else // Found a line of data
addCSVData(line); // Parse the line and add the data to the HashMap
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
}
if ( !sawEOF ) {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
}
printedHeader = true;
for (File input : inputs) {
RecalibrationReport inputReport = new RecalibrationReport(input);
if (generalReport == null)
generalReport = inputReport;
else
generalReport.combine(inputReport);
}
if (generalReport == null)
throw new ReviewedStingException(EMPTY_INPUT_LIST);
// Write output file from dataMap
for(Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet())
o.println(entry.getKey() + entry.getValue().outputToCSV());
o.println("EOF");
generalReport.calculateEmpiricalAndQuantizedQualities();
generalReport.output(outputFile);
}
}

View File

@ -25,17 +25,17 @@ import java.util.*;
* @since 3/6/12
*/
public class BQSRKeyManager {
private List<RequiredCovariateInfo> requiredCovariates;
private List<OptionalCovariateInfo> optionalCovariates;
private Map<String, Short> covariateNameToIDMap;
private final List<RequiredCovariateInfo> requiredCovariates;
private final List<OptionalCovariateInfo> optionalCovariates;
private final Map<String, Short> covariateNameToIDMap;
private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites
private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
private int totalNumberOfBits; // Sum of all of the above plus the event bits
private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites
private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
private final int totalNumberOfBits; // Sum of all of the above plus the event bits
private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
/**
* Initializes the KeyManager with the total number of covariates to use
@ -44,34 +44,34 @@ public class BQSRKeyManager {
* @param optionalCovariates the ordered list of optional covariates
*/
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
nRequiredBits = 0;
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
int nBits = required.numberOfBits(); // number of bits used by this covariate
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
int nBits = required.numberOfBits(); // number of bits used by this covariate
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
nRequiredBits += nBits;
}
short id = 0;
nOptionalBits = 0;
for (Covariate optional : optionalCovariates) {
int nBits = optional.numberOfBits(); // number of bits used by this covariate
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
int nBits = optional.numberOfBits(); // number of bits used by this covariate
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
this.covariateNameToIDMap.put(covariateName, id);
id++;
}
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
}
/**
@ -93,32 +93,32 @@ public class BQSRKeyManager {
* @return one key in bitset representation per covariate
*/
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
int covariateIndex = 0;
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
for (RequiredCovariateInfo infoRequired : requiredCovariates)
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
for (OptionalCovariateInfo infoOptional : optionalCovariates) {
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
if (covariateKey == null)
continue; // do not add nulls to the final set of keys.
continue; // do not add nulls to the final set of keys.
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
optionalKey.or(requiredKey); // import all the required covariates
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(optionalKey); // add this key to the list of keys
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
optionalKey.or(requiredKey); // import all the required covariates
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(optionalKey); // add this key to the list of keys
}
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(requiredKey); // add this key to the list of keys
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(requiredKey); // add this key to the list of keys
}
return allBitSets;
@ -141,25 +141,25 @@ public class BQSRKeyManager {
int requiredCovariate = 0;
for (RequiredCovariateInfo infoRequired : requiredCovariates) {
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
}
if (optionalCovariates.size() > 0) {
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
}
int eventIndex = key.length - 1; // the event type is always the last key
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
int eventIndex = key.length - 1; // the event type is always the last key
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
return bitSetKey;
}
@ -186,19 +186,19 @@ public class BQSRKeyManager {
public List<Object> keySetFrom(BitSet key) {
List<Object> objectKeys = new ArrayList<Object>();
for (RequiredCovariateInfo info : requiredCovariates) {
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
}
if (optionalCovariates.size() > 0) {
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is)
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is)
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
}
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
return objectKeys;
}
@ -227,7 +227,7 @@ public class BQSRKeyManager {
private BitSet chopNBitsFrom(BitSet key, int n) {
BitSet choppedKey = new BitSet();
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
return choppedKey;
}
@ -269,7 +269,7 @@ public class BQSRKeyManager {
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
}
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
@ -282,22 +282,20 @@ public class BQSRKeyManager {
* Aggregate information for each Covariate
*/
class RequiredCovariateInfo {
public int bitsBefore; // number of bits before this covariate in the combined bitset key
public int nBits; // number of bits used by this covariate (cached access to covariate.nBits())
public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
public Covariate covariate; // this allows reverse lookup of the Covariates in order
public final int bitsBefore; // number of bits before this covariate in the combined bitset key
public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
public final Covariate covariate; // this allows reverse lookup of the Covariates in order
RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) {
this.bitsBefore = bitsBefore;
this.nBits = nBits;
this.mask = mask;
this.covariate = covariate;
}
}
class OptionalCovariateInfo {
public BitSet covariateID; // cache the covariate ID
public Covariate covariate;
public final BitSet covariateID; // cache the covariate ID
public final Covariate covariate;
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
this.covariateID = covariateID;

View File

@ -0,0 +1,74 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map;
/**
* Class that encapsulates the information necessary for quality score quantization for BQSR
*
* @author carneiro
* @since 3/26/12
*/
/**
 * Class that encapsulates the information necessary for quality score quantization for BQSR.
 *
 * Holds the empirical-quality observation histogram and the map from original
 * quality scores to their quantized values.
 *
 * @author carneiro
 * @since 3/26/12
 */
public class QuantizationInfo {
    private List<Byte> quantizedQuals;       // original -> quantized quality map (index = original quality score)
    private List<Long> empiricalQualCounts;  // number of observations per empirical quality score

    /**
     * Creates a QuantizationInfo from precomputed data.
     *
     * @param quantizedQuals      original -> quantized quality map (index = original quality score)
     * @param empiricalQualCounts number of observations per empirical quality score
     */
    public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
        this.quantizedQuals = quantizedQuals;
        this.empiricalQualCounts = empiricalQualCounts;
    }

    /**
     * Builds the empirical quality histogram from the quality score recalibration table and
     * quantizes it to the requested number of levels.
     *
     * @param keysAndTablesMap map from key managers to their recalibration tables; the quality
     *                         score table is identified as the one whose key manager has
     *                         exactly 2 required covariates
     * @param nLevels          number of quantization levels to produce
     * @throws ReviewedStingException if no quality score table is found in the map
     */
    public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
        final Long[] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE + 1]; // histogram with the empirical quality distribution
        Arrays.fill(qualHistogram, 0L); // zero-initialize explicitly: Long array elements default to null, not 0

        Map<BitSet, RecalDatum> qualTable = null; // look for the quality score table
        for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
            BQSRKeyManager keyManager = entry.getKey();
            if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covariates
                qualTable = entry.getValue();
        }
        if (qualTable == null)
            throw new ReviewedStingException("Could not find QualityScore table.");

        for (RecalDatum datum : qualTable.values()) {
            int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
            long nObservations = datum.numObservations;
            qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
        }
        empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
        quantizeQualityScores(nLevels); // NOTE(review): overridable public method called from constructor — safe only while this class is not subclassed; confirm before subclassing
    }

    /**
     * (Re)quantizes the empirical quality histogram into the given number of levels,
     * replacing the current quantized quality map.
     *
     * @param nLevels number of quantization levels
     */
    public void quantizeQualityScores(int nLevels) {
        QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels
        quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
    }

    /**
     * @return the original -> quantized quality map (index = original quality score)
     */
    public List<Byte> getQuantizedQuals() {
        return quantizedQuals;
    }

    /**
     * Builds the GATKReport table describing the quantization map: one row per original
     * quality score with its observation count and quantized value.
     *
     * @return the quantization map as a GATKReportTable
     */
    public GATKReportTable generateReportTable() {
        GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
        quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
        quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, 0L);
        quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, (byte) 0);
        for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) {
            quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
            quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
        }
        return quantizedTable;
    }
}

View File

@ -28,7 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
@ -42,6 +43,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.PrintStream;
import java.util.*;
/**
@ -71,13 +73,14 @@ public class RecalDataManager {
public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE {
/**
* Treat reference inserted bases as reference matching bases. Very unsafe!
@ -136,25 +139,38 @@ public class RecalDataManager {
}
}
public static void listAvailableCovariates(Logger logger) {
// Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
// Print and exit if that's what was requested
logger.info("Available covariates:");
for (Class<?> covClass : covariateClasses)
logger.info(covClass.getSimpleName());
logger.info("");
/**
* Initializes the recalibration table -> key manager map
*
* @param requiredCovariates list of required covariates (in order)
* @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and its corresponding recalibration table properly initialized
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
}
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
}
/**
* Generates two lists : required covariates and optional covariates based on the user's requests.
*
*
* Performs the following tasks in order:
* 1. Adds all required covariates in order
* 2. Check if the user asked to use the standard covariates and adds them all if that's the case
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
*
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
*
* @param argumentCollection the argument collection object for the recalibration walker
* @return a pair of ordered lists : required covariates (first) and optional covariates (second)
*/
@ -194,52 +210,102 @@ public class RecalDataManager {
return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates);
}
/**
* Initializes the recalibration table -> key manager map
*
* @param requiredCovariates list of required covariates (in order)
* @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and its corresponding recalibration table properly initialized
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
}
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
/**
 * Logs the simple name of every Covariate implementation discoverable through the plugin system.
 *
 * @param logger the logger used to emit the covariate listing
 */
public static void listAvailableCovariates(Logger logger) {
    final PluginManager<Covariate> pluginManager = new PluginManager<Covariate>(Covariate.class); // discover all Covariate plugins on the classpath
    logger.info("Available covariates:");
    for (final Class<? extends Covariate> covariateClass : pluginManager.getPlugins())
        logger.info(covariateClass.getSimpleName());
    logger.info(""); // trailing blank line separates the listing from subsequent output
}
/**
* Initializes the table -> key manager map (unfortunate copy of the above code with minor modifications to accommodate the different return types (RecalDatum vs EmpiricalQual objects))
*
* @param requiredCovariates list of required covariates (in order)
* @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and it's corresponding recalibration table properly initialized
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> initializeEmpiricalTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
/**
 * Converts the in-memory recalibration tables into GATKReportTable objects for output.
 *
 * One report table is generated per (key manager, recal table) pair. Columns are, in order:
 * the names of the required covariates for that table, optionally the covariate value/name
 * pair (only when optional covariates are present), the event type, and the recalibration
 * statistics (empirical quality, estimated reported quality, observations and errors).
 *
 * Note: removed the dead trailing statements left over from the previous (EmpiricalQual-based)
 * implementation — they referenced locals that no longer exist and sat after an unreachable
 * duplicate return.
 *
 * @param keysAndTablesMap map of key managers to their corresponding recalibration tables
 * @return the list of report tables, in the iteration order of the input map
 */
public static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
    List<GATKReportTable> result = new LinkedList<GATKReportTable>();
    int tableIndex = 0;
    for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
        BQSRKeyManager keyManager = entry.getKey();
        Map<BitSet, RecalDatum> recalTable = entry.getValue();
        GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");

        // column name / format-string pairs shared by all rows of this table
        final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
        final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
        final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
        final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.2f");
        final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.2f");
        final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
        final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");

        long primaryKey = 0L;
        List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
        List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
        ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
        for (Covariate covariate : requiredList) {
            String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
            columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
        }
        if (optionalList.size() > 0) {
            columnNames.add(covariateValue);
            columnNames.add(covariateName);
        }
        columnNames.add(eventType); // the order of these column names is important here:
        columnNames.add(empiricalQuality); // the row-building loop below consumes them with an iterator
        columnNames.add(estimatedQReported); // in exactly this sequence (keys first, then the four datum columns)
        columnNames.add(nObservations);
        columnNames.add(nErrors);

        reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
        for (Pair<String, String> columnName : columnNames)
            reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond());

        for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
            BitSet bitSetKey = recalTableEntry.getKey();
            Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
            Iterator<Pair<String, String>> iterator = columnNames.iterator();
            for (Object key : keyManager.keySetFrom(bitSetKey)) { // decode the bitset back into the covariate key values
                String columnName = iterator.next().getFirst();
                columnData.put(columnName, key);
            }
            RecalDatum datum = recalTableEntry.getValue();
            columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); // iterator.next() gives the column name for Empirical Quality
            columnData.put(iterator.next().getFirst(), Math.round(datum.getEstimatedQReported())); // iterator.next() gives the column name for EstimatedQReported
            columnData.put(iterator.next().getFirst(), datum.numObservations);
            columnData.put(iterator.next().getFirst(), datum.numMismatches);
            for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
                String columnName = dataEntry.getKey();
                Object value = dataEntry.getValue();
                reportTable.set(primaryKey, columnName, value.toString());
            }
            primaryKey++;
        }
        result.add(reportTable);
    }
    return result;
}
/**
 * Writes a complete recalibration report assembled from the argument collection, the
 * quantization info and the in-memory recalibration tables.
 *
 * @param RAC              the recalibration argument collection used in this run
 * @param quantizationInfo the quality quantization information
 * @param keysAndTablesMap map of key managers to their corresponding recalibration tables
 * @param outputFile       the stream to print the report to
 */
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
    final GATKReportTable argumentTable = RAC.generateReportTable();
    final GATKReportTable quantizationTable = quantizationInfo.generateReportTable();
    final List<GATKReportTable> recalTables = generateReportTables(keysAndTablesMap);
    outputRecalibrationReport(argumentTable, quantizationTable, recalTables, outputFile);
}
/**
 * Writes a complete recalibration report, reusing an already-built arguments table
 * (useful when the report is being regenerated from a previously parsed report).
 *
 * @param argumentTable    the pre-built arguments report table
 * @param quantizationInfo the quality quantization information
 * @param keysAndTablesMap map of key managers to their corresponding recalibration tables
 * @param outputFile       the stream to print the report to
 */
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
    final GATKReportTable quantizationTable = quantizationInfo.generateReportTable();
    outputRecalibrationReport(argumentTable, quantizationTable, generateReportTables(keysAndTablesMap), outputFile);
}
/**
 * Assembles the final GATKReport (arguments table, quantization table, then every recal
 * table) and prints it to the given stream.
 *
 * @param argumentTable     the arguments report table
 * @param quantizationTable the quantization report table
 * @param recalTables       the recalibration report tables, already ordered
 * @param outputFile        the stream to print the report to
 */
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
    final GATKReport recalibrationReport = new GATKReport();
    recalibrationReport.addTable(argumentTable);
    recalibrationReport.addTable(quantizationTable);
    for (final GATKReportTable recalTable : recalTables)
        recalibrationReport.addTable(recalTable);
    recalibrationReport.print(outputFile);
}
/**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string

View File

@ -38,6 +38,8 @@ public class RecalDatum extends RecalDatumOptimized {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
//---------------------------------------------------------------------------------------------------------------
//
// constructors
@ -75,7 +77,6 @@ public class RecalDatum extends RecalDatumOptimized {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment(other.numObservations, other.numMismatches);
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
}
//---------------------------------------------------------------------------------------------------------------
@ -84,8 +85,8 @@ public class RecalDatum extends RecalDatumOptimized {
//
//---------------------------------------------------------------------------------------------------------------
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
public final void calcCombinedEmpiricalQuality(final int maxQual) {
this.empiricalQuality = empiricalQualDouble(SMOOTHING_CONSTANT, maxQual); // cache the value so we don't call log over and over again
}
public final void calcEstimatedReportedQuality() {
@ -106,6 +107,11 @@ public class RecalDatum extends RecalDatumOptimized {
return empiricalQuality;
}
/**
 * Clears both cached quality values so callers are forced to recompute them after
 * datum objects have been combined (the two resets are order-independent).
 */
public final void resetCalculatedQualities() {
    estimatedQReported = 0.0;
    empiricalQuality = 0.0;
}
// Expected number of errors implied by the estimated reported quality across all observations.
private double calcExpectedErrors() {
    final double errorProbability = qualToErrorProb(estimatedQReported); // per-base error probability for the reported quality
    return errorProbability * (double) this.numObservations;
}

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream;
import java.util.Collections;
@ -156,5 +158,25 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
/**
 * Dumps this argument collection into a GATKReport table so the exact parameters used in a
 * recalibration run can be read back later (see the report parser, which consumes these
 * primary keys one by one).
 *
 * Fix: the "deletions_default_quality" row was missing even though the report parser reads
 * that key (and the mismatches/insertions siblings are written), so round-tripping a report
 * silently dropped the value.
 *
 * @return the arguments table, keyed by argument name with a single value column
 */
public GATKReportTable generateReportTable() {
    GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
    argumentsTable.addPrimaryKey("Argument");
    argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, "null");
    argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
    argumentsTable.set("standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, USE_STANDARD_COVARIATES);
    argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
    argumentsTable.set("solid_recal_mode", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
    argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
    argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
    argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE);
    argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE);
    argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
    argumentsTable.set("insertions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
    argumentsTable.set("deletions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); // previously missing; the parser expects this key
    argumentsTable.set("low_quality_tail", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
    argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
    argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
    argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
    return argumentsTable;
}
}

View File

@ -0,0 +1,290 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
 * Reads a BQSR recalibration report file into memory.
 *
 * Parses the argument, quantization and recalibration tables of a GATK report written by a
 * base quality score recalibration run, and rebuilds the in-memory structures (key managers,
 * recal tables, covariates) needed to apply the recalibration or to combine several reports.
 *
 * @author carneiro
 * @since 3/26/12
 */
public class RecalibrationReport {
    private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
    private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // ordered map from each key manager to its recalibration table (insertion order matters for combine())
    private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation

    GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
    RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter | todo -- this should be a new parameter, not necessarily coming from the original table parameter list

    private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";

    /**
     * Reads and parses the recalibration report file, rebuilding the covariates and the
     * per-table key managers in the same order used when the report was written
     * (read group table, then quality score table, then the all-covariates table).
     *
     * @param RECAL_FILE a GATK report file produced by a BQSR run
     */
    public RecalibrationReport(final File RECAL_FILE) {
        GATKReport report = new GATKReport(RECAL_FILE);

        argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
        RAC = initializeArgumentCollectionTable(argumentTable);

        GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
        quantizationInfo = initializeQuantizationTable(quantizedTable);

        Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
        ArrayList<Covariate> requiredCovariates = covariates.getFirst();
        ArrayList<Covariate> optionalCovariates = covariates.getSecond();
        requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
        requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
        for (Covariate cov : requestedCovariates)
            cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection

        keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
        ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
        ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
        for (Covariate covariate : requiredCovariates) {
            requiredCovariatesToAdd.add(covariate);
            final Map<BitSet, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
            final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing its corresponding key manager
            int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
            if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
                final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
                table = parseReadGroupTable(keyManager, reportTable);
            }
            else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
                final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
                table = parseQualityScoreTable(keyManager, reportTable);
            }
            else
                throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION); // hard check: a third required covariate is not supported by this parser
            keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
        }
        final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing its corresponding key manager
        final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
        final Map<BitSet, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
        keysAndTablesMap.put(keyManager, table);
    }

    /**
     * Combines two recalibration reports by adding all observations and errors
     *
     * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate them
     * after combining. The reason for not calculating it is because this function is intended for combining a series of
     * recalibration reports, and it only makes sense to calculate the empirical qualities and quantized qualities after all
     * the recalibration reports have been combined. Having the user recalculate when appropriate, makes this method faster
     *
     * Note2: The empirical quality reported, however, is recalculated given its simplicity.
     *
     * @param other the recalibration report to combine with this one
     */
    public void combine(RecalibrationReport other) {
        Iterator<Map<BitSet, RecalDatum>> tableIterator = keysAndTablesMap.values().iterator(); // because these are ordered (linked hashmaps) we can iterate over the 'this' and do a for loop on the 'other' tables and be sure that we are looking at the equivalent tables on both objects
        for (Map<BitSet, RecalDatum> otherTable : other.getKeysAndTablesMap().values()) { // iterate over all tables for 'other'
            Map<BitSet, RecalDatum> thisTable = tableIterator.next(); // iterate over all tables for 'this'
            for (Map.Entry<BitSet, RecalDatum> entry : otherTable.entrySet()) { // for each table, go through all the entries in the 'other' dataset to update 'this' dataset
                BitSet key = entry.getKey();
                RecalDatum otherDatum = entry.getValue();
                // NOTE(review): assumes every key in 'other' is also present in 'this' table;
                // a key missing here would make thisDatum null and NPE on increment() — confirm
                // that all reports being combined were generated over identical key sets.
                RecalDatum thisDatum = thisTable.get(key);
                thisDatum.increment(otherDatum); // add the two datum objects into 'this'
                thisDatum.resetCalculatedQualities(); // reset the empirical quality to make sure the user doesn't forget to recalculate it
            }
        }
    }

    // @return the quantization info parsed from the report's quantization table
    public QuantizationInfo getQuantizationInfo() {
        return quantizationInfo;
    }

    // @return the ordered map from key managers to their recalibration tables
    public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() {
        return keysAndTablesMap;
    }

    // @return all covariates (required then optional) reconstructed from the report's arguments
    public ArrayList<Covariate> getRequestedCovariates() {
        return requestedCovariates;
    }

    /**
     * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
     *
     * @param keyManager the key manager for this table
     * @param reportTable the GATKReport table containing data for this table
     * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
     */
    private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
        ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
        columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
        return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
    }

    /**
     *
     * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
     * @param keyManager the key manager for this table
     * @param reportTable the GATKReport table containing data for this table
     * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
     */
    private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
        ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
        columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
        return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
    }

    /**
     * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
     *
     * @param keyManager the key manager for this table
     * @param reportTable the GATKReport table containing data for this table
     * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
     */
    private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
        ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
        columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
        columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
        return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
    }

    /**
     * Shared parsing functionality for all tables.
     *
     * @param keyManager the key manager for this table
     * @param reportTable the GATKReport table containing data for this table
     * @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
     * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
     */
    private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
        Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2); // presize to roughly double the row count to limit rehashing
        for (Object primaryKey : reportTable.getPrimaryKeys()) {
            int nKeys = columnNamesOrderedList.size();
            Object [] keySet = new Object[nKeys];
            for (int i = 0; i < nKeys; i++)
                keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type, see below)
            keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
            BitSet bitKey = keyManager.bitSetFromKey(keySet);
            long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
            long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
            double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
            double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
            RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
            result.put(bitKey, recalDatum);
        }
        return result;
    }

    /**
     * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
     *
     * @param table the GATKReportTable containing the quantization mappings
     * @return a QuantizationInfo with the quantization mappings and counts from 0 to MAX_QUAL_SCORE
     */
    private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
        Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
        Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
        for (Object primaryKey : table.getPrimaryKeys()) {
            Object quantizedObject = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
            Object countObject = table.get(primaryKey, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
            byte originalQual = Byte.parseByte(primaryKey.toString()); // the primary key of this table is the original quality score
            byte quantizedQual = Byte.parseByte(quantizedObject.toString());
            long quantizedCount = Long.parseLong(countObject.toString());
            quals[originalQual] = quantizedQual;
            counts[originalQual] = quantizedCount;
        }
        return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts));
    }

    /**
     * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
     *
     * NOTE(review): "run_without_dbsnp" is written by the report generator but never parsed
     * here, so it keeps the RAC default — confirm whether that is intentional.
     *
     * @param table the GATKReportTable containing the arguments and its corresponding values
     * @return a RAC object properly initialized with all the objects in the table
     */
    private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
        RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
        for (Object primaryKey : table.getPrimaryKeys()) {
            Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
            if (value.equals("null"))
                value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport

            if (primaryKey.equals("covariate") && value != null)
                RAC.COVARIATES = value.toString().split(",");
            else if (primaryKey.equals("standard_covs"))
                RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
            else if (primaryKey.equals("solid_recal_mode"))
                RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
            else if (primaryKey.equals("solid_nocall_strategy"))
                RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
            else if (primaryKey.equals("mismatches_context_size"))
                RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
            else if (primaryKey.equals("insertions_context_size"))
                RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
            else if (primaryKey.equals("deletions_context_size"))
                RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
            else if (primaryKey.equals("mismatches_default_quality"))
                RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
            else if (primaryKey.equals("insertions_default_quality"))
                RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
            else if (primaryKey.equals("deletions_default_quality"))
                RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
            else if (primaryKey.equals("low_quality_tail"))
                RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
            else if (primaryKey.equals("default_platform"))
                RAC.DEFAULT_PLATFORM = (String) value;
            else if (primaryKey.equals("force_platform"))
                RAC.FORCE_PLATFORM = (String) value;
            else if (primaryKey.equals("quantizing_levels"))
                RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
        }
        return RAC;
    }

    /**
     * this functionality avoids recalculating the empirical qualities, estimated reported quality
     * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
     */
    public void calculateEmpiricalAndQuantizedQualities() {
        quantizationInfo.quantizeQualityScores(RAC.QUANTIZING_LEVELS);
        for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) {
            for (RecalDatum datum : table.values()) {
                datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE);
                datum.calcEstimatedReportedQuality();
            }
        }
    }

    // Writes this report (argument table, quantization info and recal tables) to the given stream.
    public void output(PrintStream output) {
        RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output);
    }
}

View File

@ -25,13 +25,9 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -46,224 +42,27 @@ import java.util.*;
*/
public class BaseRecalibration {
private List<Byte> qualQuantizationMap; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/**
 * Private no-arg constructor: this class is only meaningful when initialized from a GATK
 * report file, so ALWAYS use the constructor that takes the report File.
 */
private BaseRecalibration() {}
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
*/
public BaseRecalibration(final File RECAL_FILE) {
    // All table parsing now lives in RecalibrationReport; this constructor only keeps
    // references to the parsed structures. (Removed the dead interleaved remnants of the
    // old in-place parsing code, which referenced undeclared locals such as 'report' and
    // the retired EmpiricalQual-based tables.)
    RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);

    quantizationInfo = recalibrationReport.getQuantizationInfo();       // quantization histogram parsed from the report
    keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();       // ordered key-manager -> recal-table map
    requestedCovariates = recalibrationReport.getRequestedCovariates(); // required + optional covariates, already initialized
}
/**
 * Builds the lookup table for the full covariates table (the table keyed by every covariate).
 *
 * Assembles the ordered list of key columns — read group, quality score, covariate value,
 * covariate name and event type — and delegates the row-by-row parsing to the shared routine.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key
 */
private Map<BitSet, EmpiricalQual> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(5);
    keyColumns.add(RecalDataManager.READGROUP_COLUMN_NAME);
    keyColumns.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
    keyColumns.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
    keyColumns.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
    keyColumns.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
    return genericRecalTableParsing(keyManager, reportTable, keyColumns);
}
/**
 * Builds the lookup table for the QualityScore table.
 *
 * Assembles the ordered list of key columns — read group, quality score and event type —
 * and delegates the row-by-row parsing to the shared routine.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key
 */
private Map<BitSet, EmpiricalQual> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(3);
    keyColumns.add(RecalDataManager.READGROUP_COLUMN_NAME);
    keyColumns.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
    keyColumns.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
    return genericRecalTableParsing(keyManager, reportTable, keyColumns);
}
/**
 * Builds the lookup table for the ReadGroup table.
 *
 * Assembles the ordered list of key columns — read group and event type — and delegates
 * the row-by-row parsing to the shared routine.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key
 */
private Map<BitSet, EmpiricalQual> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(2);
    keyColumns.add(RecalDataManager.READGROUP_COLUMN_NAME);
    keyColumns.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
    return genericRecalTableParsing(keyManager, reportTable, keyColumns);
}
/**
 * Shared parsing functionality for all recalibration tables.
 *
 * For every row in the report table, reads the requested key columns in order, converts
 * the trailing event-type string into its enum form, hashes the key through the key
 * manager, and records the (estimated quality reported, empirical quality) pair.
 *
 * @param keyManager             the key manager for this table
 * @param reportTable            the GATKReport table containing data for this table
 * @param columnNamesOrderedList the ordered list of columns to read from the report table and build as key for this particular table
 * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key
 */
private Map<BitSet, EmpiricalQual> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
    final int nKeys = columnNamesOrderedList.size();
    final Map<BitSet, EmpiricalQual> parsedTable = new HashMap<BitSet, EmpiricalQual>(reportTable.getNumRows() * 2);
    for (Object rowKey : reportTable.getPrimaryKeys()) {
        final Object[] covariateKeys = new Object[nKeys];
        for (int column = 0; column < nKeys; column++)
            covariateKeys[column] = reportTable.get(rowKey, columnNamesOrderedList.get(column)); // the key manager handles these values correctly in String form...
        final int last = nKeys - 1; // ...except for the event type, which is always the last key
        covariateKeys[last] = EventType.eventFrom((String) covariateKeys[last]); // convert the string ("M", "I" or "D") to an enum object (necessary for the key manager)
        final BitSet hashedKey = keyManager.bitSetFromKey(covariateKeys);
        final double estimatedQReported = (Double) reportTable.get(rowKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
        final double empiricalQuality = (Double) reportTable.get(rowKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
        parsedTable.put(hashedKey, new EmpiricalQual(estimatedQReported, empiricalQuality));
    }
    return parsedTable;
}
/**
 * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores.
 *
 * Each primary key of the table is an original quality score and the QuantizedValue column
 * holds the quality it was quantized to.
 *
 * @param table the GATKReportTable containing the quantization mappings
 * @return a fixed-size list, indexed by original quality from 0 to MAX_QUAL_SCORE, of the corresponding quantized qualities
 */
private List<Byte> initializeQuantizationTable(GATKReportTable table) {
    final Byte[] quantizedQuals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
    for (Object rowKey : table.getPrimaryKeys()) {
        final byte original = Byte.parseByte(rowKey.toString());
        final byte quantized = Byte.parseByte(table.get(rowKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME).toString());
        quantizedQuals[original] = quantized; // entries absent from the table stay null -- presumably the report always covers the full range; verify against the writer
    }
    return Arrays.asList(quantizedQuals);
}
/**
 * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values.
 *
 * Each primary key of the table is an argument name; its value is read from the
 * ArgumentValue column and assigned to the matching field of the argument collection.
 * Argument names not listed in the table keep the RAC defaults.
 *
 * @param table the GATKReportTable containing the arguments and its corresponding values
 * @return a RAC object properly initialized with all the objects in the table
 */
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
    RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
    for (Object primaryKey : table.getPrimaryKeys()) {
        Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
        if (value.equals("null"))
            value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
        // dispatch on the argument name; every value arrives as a String and is parsed into the field's type
        if (primaryKey.equals("covariate") && value != null)
            RAC.COVARIATES = value.toString().split(",");
        else if (primaryKey.equals("standard_covs"))
            RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
        else if (primaryKey.equals("solid_recal_mode"))
            RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
        else if (primaryKey.equals("solid_nocall_strategy"))
            RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
        else if (primaryKey.equals("mismatches_context_size"))
            RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); // NOTE(review): parse calls below throw NumberFormatException/NPE if the value was "null" -- presumably these arguments are always written; confirm against the report writer
        else if (primaryKey.equals("insertions_context_size"))
            RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
        else if (primaryKey.equals("deletions_context_size"))
            RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
        else if (primaryKey.equals("mismatches_default_quality"))
            RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
        else if (primaryKey.equals("insertions_default_quality"))
            RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
        else if (primaryKey.equals("deletions_default_quality"))
            RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
        else if (primaryKey.equals("low_quality_tail"))
            RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
        else if (primaryKey.equals("default_platform"))
            RAC.DEFAULT_PLATFORM = (String) value;
        else if (primaryKey.equals("force_platform"))
            RAC.FORCE_PLATFORM = (String) value;
        else if (primaryKey.equals("quantizing_levels"))
            RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
    }
    return RAC;
}
/**
* Recalibrates the base qualities of a read
*
@ -316,9 +115,9 @@ public class BaseRecalibration {
double deltaQReported = 0.0;
double deltaQCovariates = 0.0;
for (Map.Entry<BQSRKeyManager, Map<BitSet, EmpiricalQual>> mapEntry : keysAndTablesMap.entrySet()) {
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> mapEntry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = mapEntry.getKey();
Map<BitSet, EmpiricalQual> table = mapEntry.getValue();
Map<BitSet, RecalDatum> table = mapEntry.getValue();
switch(keyManager.getRequiredCovariates().size()) {
case 1: // this is the ReadGroup table
@ -326,7 +125,7 @@ public class BaseRecalibration {
if (bitKeys.size() > 1)
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
final EmpiricalQual empiricalQualRG = table.get(bitKeys.get(0));
final RecalDatum empiricalQualRG = table.get(bitKeys.get(0));
if (empiricalQualRG != null) {
final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality();
final double aggregrateQReported = empiricalQualRG.getEstimatedQReported();
@ -339,7 +138,7 @@ public class BaseRecalibration {
if (bitKeys.size() > 1)
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
final EmpiricalQual empiricalQualQS = table.get(bitKeys.get(0));
final RecalDatum empiricalQualQS = table.get(bitKeys.get(0));
if (empiricalQualQS != null) {
final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
@ -348,7 +147,7 @@ public class BaseRecalibration {
else { // this is the table with all the covariates
bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn
for (BitSet k : bitKeys) {
final EmpiricalQual empiricalQualCO = table.get(k);
final RecalDatum empiricalQualCO = table.get(k);
if (empiricalQualCO != null) {
double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality();
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
@ -364,7 +163,8 @@ public class BaseRecalibration {
double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula
recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL
return qualQuantizationMap.get((int) recalibratedQual); // return the quantized version of the recalibrated quality
return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality
}
}

View File

@ -0,0 +1,476 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.PrintStream;
import java.util.*;
/**
* A general algorithm for quantizing quality score distributions to use a specific number of levels
*
* Takes a histogram of quality scores and a desired number of levels and produces a
* map from original quality scores -> quantized quality scores.
*
* Note that this data structure is fairly heavy-weight, holding lots of debugging and
* calculation information. If you want to use it efficiently at scale with lots of
* read groups the right way to do this:
*
* Map<ReadGroup, List<Byte>> map
* for each read group rg:
* hist = getQualHist(rg)
* QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual)
* map.set(rg, qq.getOriginalToQuantizedMap())
*
* This map would then be used to look up the appropriate original -> quantized
* quals for each read as it comes in.
*
* @author Mark Depristo
* @since 3/2/12
*/
public class QualQuantizer {
    /** Immutable empty set shared by all leaf intervals, to avoid allocating one per interval. */
    final private static Set<QualInterval> MY_EMPTY_SET = Collections.emptySet();
    private static Logger logger = Logger.getLogger(QualQuantizer.class);

    /**
     * Inputs to the QualQuantizer: the number of quantized levels to produce and the
     * quality score at or below which bins may be merged for free.
     */
    final int nLevels, minInterestingQual;

    /** Histogram of observation counts indexed by quality score, starting at Q0. */
    final List<Long> nObservationsPerQual;

    /**
     * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28).
     *
     * Has the same range as nObservationsPerQual
     */
    final List<Byte> originalToQuantizedMap;

    /** Sorted set of qual intervals.
     *
     * After quantize() this data structure contains only the top-level qual intervals
     */
    final TreeSet<QualInterval> quantizedIntervals;

    /**
     * Protected creator for testng use only: initializes the inputs but skips the
     * quantization computation entirely (map and intervals are left null).
     */
    protected QualQuantizer(final int minInterestingQual) {
        this.nObservationsPerQual = Collections.emptyList();
        this.nLevels = 0;
        this.minInterestingQual = minInterestingQual;
        this.quantizedIntervals = null;
        this.originalToQuantizedMap = null;
    }

    /**
     * Creates a QualQuantizer for the histogram that has nLevels
     *
     * Note this is the only interface to the system. After creating this object
     * the map can be obtained via getOriginalToQuantizedMap()
     *
     * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that
     * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the
     * way up to the largest quality score possible in the reads. OK if the histogram includes many 0
     * count bins, as these are quantized for free.
     * @param nLevels the desired number of distinct quality scores to represent the full original range. Must
     * be at least 1.
     * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely
     * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and
     * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing
     * all data with Q0-Q10.
     */
    public QualQuantizer(final List<Long> nObservationsPerQual, final int nLevels, final int minInterestingQual) {
        this.nObservationsPerQual = nObservationsPerQual;
        this.nLevels = nLevels;
        this.minInterestingQual = minInterestingQual;

        // some sanity checking
        if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedStingException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual));
        if ( nLevels < 0 ) throw new ReviewedStingException("nLevels must be >= 0");
        if ( minInterestingQual < 0 ) throw new ReviewedStingException("minInterestingQual must be >= 0");

        // actually run the quantizer
        this.quantizedIntervals = quantize();

        // store the map
        this.originalToQuantizedMap = intervalsToMap(quantizedIntervals);
    }

    /**
     * Represents a contiguous interval of quality scores.
     *
     * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2
     */
    @Invariant({
            "qStart <= qEnd",
            "qStart >= 0",
            "qEnd <= 1000",
            "nObservations >= 0",
            "nErrors >= 0",
            "nErrors <= nObservations",
            "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_QUAL_SCORE",
            "mergeOrder >= 0"})
    protected final class QualInterval implements Comparable<QualInterval> {
        final int qStart, qEnd, fixedQual, level;
        final long nObservations, nErrors;
        final Set<QualInterval> subIntervals;

        /** for debugging / visualization. When was this interval created? */
        int mergeOrder;

        protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) {
            this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET);
        }

        protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set<QualInterval> subIntervals) {
            this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals);
        }

        protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) {
            this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET);
        }

        @Requires("level >= 0")
        public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set<QualInterval> subIntervals) {
            this.qStart = qStart;
            this.qEnd = qEnd;
            this.nObservations = nObservations;
            this.nErrors = nErrors;
            this.fixedQual = fixedQual;
            this.level = level;
            this.mergeOrder = 0;
            this.subIntervals = Collections.unmodifiableSet(subIntervals);
        }

        /**
         * Human readable name of this interval: e.g., 10-12
         * @return the "qStart-qEnd" string for this interval
         */
        public String getName() {
            return qStart + "-" + qEnd;
        }

        @Override
        public String toString() {
            return "QQ:" + getName();
        }

        /**
         * Returns the error rate (in real space) of this interval, or 0 if there are no observations
         * @return the pseudocount-smoothed error rate (nErrors+1)/(nObservations+1), or the error rate implied by fixedQual when one is set
         */
        @Ensures("result >= 0.0")
        public double getErrorRate() {
            if ( hasFixedQual() )
                return QualityUtils.qualToErrorProb((byte)fixedQual);
            else if ( nObservations == 0 )
                return 0.0;
            else
                return (nErrors+1) / (1.0 * (nObservations+1));
        }

        /**
         * Returns the QUAL of the error rate of this interval, or the fixed
         * qual if this interval was created with a fixed qual.
         * @return the phred-scaled quality of this interval
         */
        @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
        public byte getQual() {
            if ( ! hasFixedQual() )
                return QualityUtils.probToQual(1-getErrorRate(), 0);
            else
                return (byte)fixedQual;
        }

        /**
         * @return true if this bin is using a fixed qual
         */
        public boolean hasFixedQual() {
            return fixedQual != -1;
        }

        @Override
        public int compareTo(final QualInterval qualInterval) {
            // intervals are ordered by their starting quality score only
            return new Integer(this.qStart).compareTo(qualInterval.qStart);
        }

        /**
         * Create a interval representing the merge of this interval and toMerge
         *
         * Errors and observations are combined
         * Subintervals updated in order of left to right (determined by qStart)
         * Level is 1 + highest level of this and toMerge
         * Order must be updated elsewhere
         *
         * @param toMerge the adjacent interval to merge with this one
         * @return newly created merged QualInterval
         */
        @Requires({"toMerge != null"})
        @Ensures({
                "result != null",
                "result.nObservations >= this.nObservations",
                "result.nObservations >= toMerge.nObservations",
                "result.nErrors >= this.nErrors",
                "result.nErrors >= toMerge.nErrors",
                "result.qStart == Math.min(this.qStart, toMerge.qStart)",
                "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)",
                "result.level > Math.max(this.level, toMerge.level)",
                "result.subIntervals.size() == 2"
        })
        public QualInterval merge(final QualInterval toMerge) {
            final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge;
            final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;

            if ( left.qEnd + 1 != right.qStart )
                throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right);

            final long nCombinedObs = left.nObservations + right.nObservations;
            final long nCombinedErr = left.nErrors + right.nErrors;

            final int level = Math.max(left.level, right.level) + 1;
            final Set<QualInterval> subIntervals = new HashSet<QualInterval>(Arrays.asList(left, right));
            QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals);

            return merged;
        }

        public double getPenalty() {
            return calcPenalty(getErrorRate());
        }

        /**
         * Calculate the penalty of this interval, given the overall error rate for the interval
         *
         * If the globalErrorRate is e, this value is:
         *
         * sum_i |log10(e_i) - log10(e)| * nObservations_i
         *
         * each the index i applies to all leaves of the tree accessible from this interval
         * (found recursively from subIntervals as necessary)
         *
         * @param globalErrorRate overall error rate in real space against which we calculate the penalty
         * @return the cost of approximating the bins in this interval with the globalErrorRate
         */
        @Requires("globalErrorRate >= 0.0")
        @Ensures("result >= 0.0")
        private double calcPenalty(final double globalErrorRate) {
            if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
                return 0.0;

            if ( subIntervals.isEmpty() ) {
                // this is a leaf node
                if ( this.qEnd <= minInterestingQual )
                    // It's free to merge up quality scores below the smallest interesting one
                    return 0;
                else {
                    return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations;
                }
            } else {
                // interior node: the penalty is the sum over all leaves reachable from here
                double sum = 0;
                for ( final QualInterval interval : subIntervals )
                    sum += interval.calcPenalty(globalErrorRate);
                return sum;
            }
        }
    }

    /**
     * Main method for computing the quantization intervals.
     *
     * Invoked in the constructor after all input variables are initialized. Walks
     * over the inputs and builds the min. penalty forest of intervals with exactly nLevel
     * root nodes. Finds this min. penalty forest via greedy search, so is not guaranteed
     * to find the optimal combination.
     *
     * TODO: develop a smarter algorithm
     *
     * @return the forest of intervals with size == nLevels
     */
    @Ensures({"! result.isEmpty()", "result.size() == nLevels"})
    private TreeSet<QualInterval> quantize() {
        // create intervals for each qual individually
        final TreeSet<QualInterval> intervals = new TreeSet<QualInterval>();
        for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) {
            final long nObs = nObservationsPerQual.get(qStart);
            final double errorRate = QualityUtils.qualToErrorProb((byte)qStart);
            final double nErrors = nObs * errorRate; // expected number of errors at this qual given its nominal error rate
            final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart);
            intervals.add(qi);
        }

        // greedy algorithm:
        // while ( n intervals >= nLevels ):
        //   find intervals to merge with least penalty
        //   merge it
        while ( intervals.size() > nLevels ) {
            mergeLowestPenaltyIntervals(intervals);
        }

        return intervals;
    }

    /**
     * Helper function that finds and merges together the lowest penalty pair
     * of adjacent intervals
     * @param intervals the current forest of intervals; modified in place
     */
    @Requires("! intervals.isEmpty()")
    private void mergeLowestPenaltyIntervals(final TreeSet<QualInterval> intervals) {
        // setup the iterators: it1 walks the intervals, it1p walks one interval ahead so each step sees an adjacent (left, right) pair
        final Iterator<QualInterval> it1 = intervals.iterator();
        final Iterator<QualInterval> it1p = intervals.iterator();
        it1p.next(); // skip one

        // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty
        QualInterval minMerge = null;
        if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size());
        int lastMergeOrder = 0;
        while ( it1p.hasNext() ) {
            final QualInterval left = it1.next();
            final QualInterval right = it1p.next();
            final QualInterval merged = left.merge(right);
            lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder);
            if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) {
                // note: this logs the previous best before it is replaced
                if ( logger.isDebugEnabled() ) logger.debug("  Updating merge " + minMerge);
                minMerge = merged;
            }
        }

        // now actually go ahead and merge the minMerge pair: replace the two children with their merged parent
        if ( logger.isDebugEnabled() ) logger.debug("  => final min merge " + minMerge);
        intervals.removeAll(minMerge.subIntervals);
        intervals.add(minMerge);
        minMerge.mergeOrder = lastMergeOrder + 1;
        if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals);
    }

    /**
     * Given a final forest of intervals constructs a list mapping
     * list.get(i) => quantized qual to use for original quality score i
     *
     * This function should be called only once to initialize the corresponding
     * cached value in this object, as the calculation is a bit costly.
     *
     * @param intervals the final forest of top-level intervals
     * @return the original-to-quantized qual map, one entry per qual in the histogram
     */
    @Ensures("result.size() == getNQualsInHistogram()")
    private List<Byte> intervalsToMap(final TreeSet<QualInterval> intervals) {
        // pre-fill with a sentinel so that any qual not covered by an interval is detectable below
        final List<Byte> map = new ArrayList<Byte>(getNQualsInHistogram());
        map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE));
        for ( final QualInterval interval : intervals ) {
            for ( int q = interval.qStart; q <= interval.qEnd; q++ ) {
                map.set(q, interval.getQual());
            }
        }

        if ( Collections.min(map) == Byte.MIN_VALUE )
            throw new ReviewedStingException("quantized quality score map contains an un-initialized value");

        return map;
    }

    @Ensures("result > 0")
    private final int getNQualsInHistogram() {
        return nObservationsPerQual.size();
    }

    /**
     * Write out a GATKReport to visualize the QualQuantization process of this data
     * @param out destination stream for the report
     */
    public void writeReport(PrintStream out) {
        final GATKReport report = new GATKReport();
        addQualHistogramToReport(report);
        addIntervalsToReport(report);
        report.print(out);
    }

    /** Adds the input quality histogram as a table of the report, one row per qual. */
    private final void addQualHistogramToReport(final GATKReport report) {
        report.addTable("QualHistogram", "Quality score histogram provided to report");
        GATKReportTable table = report.getTable("QualHistogram");

        table.addPrimaryKey("qual");
        table.addColumn("count", "NA");

        for ( int q = 0; q < nObservationsPerQual.size(); q++ ) {
            table.set(q, "count", nObservationsPerQual.get(q));
        }
    }

    /** Adds a table describing every interval of the final quantization forest to the report. */
    private final void addIntervalsToReport(final GATKReport report) {
        report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals");
        GATKReportTable table = report.getTable("QualQuantizerIntervals");

        table.addPrimaryKey("name");
        table.addColumn("qStart", "NA");
        table.addColumn("qEnd", "NA");
        table.addColumn("level", "NA");
        table.addColumn("merge.order", "NA");
        table.addColumn("nErrors", "NA");
        table.addColumn("nObservations", "NA");
        table.addColumn("qual", "NA");
        table.addColumn("penalty", "NA");
        table.addColumn("root.node", "NA");
        //table.addColumn("subintervals", "NA");

        for ( QualInterval interval : quantizedIntervals)
            addIntervalToReport(table, interval, true);
    }

    /** Recursively writes one row per interval (and all of its sub-intervals) into the table. */
    private final void addIntervalToReport(final GATKReportTable table, QualInterval interval, final boolean atRootP) {
        final String name = interval.getName();
        table.set(name, "qStart", interval.qStart);
        table.set(name, "qEnd", interval.qEnd);
        table.set(name, "level", interval.level);
        table.set(name, "merge.order", interval.mergeOrder);
        table.set(name, "nErrors", interval.nErrors);
        table.set(name, "nObservations", interval.nObservations);
        table.set(name, "qual", interval.getQual());
        table.set(name, "penalty", String.format("%.1f", interval.getPenalty()));
        table.set(name, "root.node", atRootP);

        for ( final QualInterval sub : interval.subIntervals )
            addIntervalToReport(table, sub, false);
    }

    public List<Byte> getOriginalToQuantizedMap() {
        return originalToQuantizedMap;
    }
}

View File

@ -1,5 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
@ -13,17 +16,69 @@ import java.util.List;
public class BQSRGathererUnitTest {
RecalibrationArgumentCollection RAC;
private static File recal1 = new File("public/testdata/exampleCSV.csv");
private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
private static File recal = new File("public/testdata/exampleGRP.grp");
//todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure
@Test(enabled = false)
public void testCombineTwoFiles() {
public void testCombineSimilarFiles() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
File output = new File("foo.csv");
recalFiles.add(recal1);
recalFiles.add(recal2);
File output = new File("foo.grp");
recalFiles.add(recal);
recalFiles.add(recal);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal);
GATKReport calculatedReport = new GATKReport(output);
for (GATKReportTable originalTable : originalReport.getTables()) {
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
List<String> columnsToTest = new LinkedList<String>();
if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
}
else if (originalTable.getTableName().equals(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE)) {
columnsToTest.add(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
else if (originalTable.getTableName().startsWith("RecalTable")) {
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
}
}
/**
 * Common testing functionality given the columns to test and the multiplication factor to the expected result
 *
 * @param original the original table
 * @param calculated the calculated table
 * @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
 * @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated
 */
private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest, int factor) {
    for (Object primaryKey : original.getPrimaryKeys()) { // tables don't necessarily have the same primary keys
        for (String column : columnsToTest) {
            Object actual = calculated.get(primaryKey, column);
            Object expected = original.get(primaryKey, column);
            if (factor != 1) {
                // scale the expected value, keeping its boxed type so assertEquals compares like with like
                if (expected instanceof Double)
                    expected = (Double) expected * factor;
                else if (expected instanceof Long)
                    expected = (Long) expected * factor;
                else if (expected instanceof Integer)
                    expected = (Integer) expected * factor;
                else if (expected instanceof Byte) {
                    // bugfix: (Byte * int) promotes to int and boxes to Integer, so the old
                    // comparison Byte-vs-Integer could never succeed; cast back to byte
                    expected = (byte) ((Byte) expected * factor);
                }
            }
            Assert.assertEquals(actual, expected, "Primary key: " + primaryKey + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName());
        }
    }
}
}

View File

@ -0,0 +1,132 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.LinkedList;
import java.util.List;
/**
* @author Mauricio Carneiro
* @since 3/7/12
*/
/**
 * Unit tests for {@link BQSRKeyManager}: packs per-covariate values into BitSet keys for an
 * artificial read, combines them through the key manager, and verifies that decoding the combined
 * keys reproduces the original covariate values.
 *
 * @author Mauricio Carneiro
 * @since 3/7/12
 */
public class BQSRKeyManagerUnitTest {
    // Shared argument collection used to initialize every covariate under test.
    RecalibrationArgumentCollection RAC;

    @BeforeClass
    public void init() {
        RAC = new RecalibrationArgumentCollection();
    }

    /** Exercises the full setup: two required covariates plus two optional ones. */
    @Test(enabled = true)
    public void testCombineBitSets() {
        final int nRequired = 2;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>();
        covariates.add(new ReadGroupCovariate());
        covariates.add(new QualityScoreCovariate());
        covariates.add(new CycleCovariate());
        covariates.add(new ContextCovariate());
        createReadAndTest(covariates, nRequired);
    }

    /** Edge case: only required covariates, no optional ones. */
    @Test(enabled = true)
    public void testOnlyRequiredCovariates() {
        final int nRequired = 2;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
        covariates.add(new ReadGroupCovariate());
        covariates.add(new QualityScoreCovariate());
        createReadAndTest(covariates, nRequired);
    }

    /** Edge case: a single (required) covariate. */
    @Test(enabled = true)
    public void testOnlyOneCovariate() {
        final int nRequired = 1;
        final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
        covariates.add(new ReadGroupCovariate());
        createReadAndTest(covariates, nRequired);
    }

    /**
     * Builds a 1000-base artificial read and runs the key-manager round-trip test against it in
     * several strand/pair-flag combinations, since covariate values can depend on those flags.
     */
    private void createReadAndTest(List<Covariate> covariates, int nRequired) {
        int readLength = 1000;
        GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M");
        read.setReadGroup(new GATKSAMReadGroupRecord("ID"));
        read.getReadGroup().setPlatform("illumina");        // required by platform-aware covariates such as CycleCovariate
        runTestOnRead(read, covariates, nRequired);
        read.setReadNegativeStrandFlag(true);               // negative strand, unpaired
        runTestOnRead(read, covariates, nRequired);
        read.setReadPairedFlag(true);
        read.setSecondOfPairFlag(true);                     // negative strand, second of pair
        runTestOnRead(read, covariates, nRequired);
        read.setReadNegativeStrandFlag(false);              // positive strand, second of pair
        runTestOnRead(read, covariates, nRequired);
    }

    /**
     * Core round-trip check: computes every covariate's per-base keys for each event type, feeds
     * them through a {@link BQSRKeyManager}, and asserts that decoding each combined key yields the
     * expected [required values..., optional value, optional covariate name, event type] tuple.
     */
    private void runTestOnRead(GATKSAMRecord read, List<Covariate> covariateList, int nRequired) {
        // covariateKeys[covariateIndex][eventType] holds one BitSet per read offset.
        final BitSet[][][] covariateKeys = new BitSet[covariateList.size()][EventType.values().length][];
        int i = 0;
        for (Covariate cov : covariateList) {
            cov.initialize(RAC);
            CovariateValues covValues = cov.getValues(read);
            covariateKeys[i][EventType.BASE_SUBSTITUTION.index] = covValues.getMismatches();
            covariateKeys[i][EventType.BASE_INSERTION.index] = covValues.getInsertions();
            covariateKeys[i][EventType.BASE_DELETION.index] = covValues.getDeletions();
            i++;
        }

        // The first nRequired covariates are "required"; the rest are "optional".
        List<Covariate> requiredCovariates = new LinkedList<Covariate>();
        List<Covariate> optionalCovariates = new LinkedList<Covariate>();
        for (int j=0; j<nRequired; j++)
            requiredCovariates.add(covariateList.get(j));
        for (int j=nRequired; j<covariateList.size(); j++)
            optionalCovariates.add(covariateList.get(j));

        BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);

        for (int l = 0; l < read.getReadLength(); l++) {                        // every read offset
            for (int eventType = 0; eventType < EventType.values().length; eventType++) {
                // Gather this offset's key per covariate and decode the expected plain values.
                BitSet[] keySet = new BitSet[covariateList.size()];
                Object[] expectedRequired = new Object[covariateList.size()];
                Object[] expectedCovariate = new Object[covariateList.size()];
                for (int j = 0; j < covariateList.size(); j++) {
                    keySet[j] = covariateKeys[j][eventType][l];
                    if (j < nRequired)
                        expectedRequired[j] = covariateList.get(j).keyFromBitSet(keySet[j]);
                    else
                        expectedCovariate[j - nRequired] = covariateList.get(j).keyFromBitSet(keySet[j]);
                }
                // One combined key per optional covariate (or a single key when there are none).
                List<BitSet> hashKeys = keyManager.bitSetsFromAllKeys(keySet, EventType.eventFrom(eventType));
                short cov = 0;
                for (BitSet key : hashKeys) {
                    Object[] actual = keyManager.keySetFrom(key).toArray();

                    // Build the expected array. With optional covariates the decoded key set has
                    // two extra trailing slots (optional value + optional covariate name) before
                    // the event type; without them only the event type is appended.
                    Object[] expected = new Object[nRequired + (optionalCovariates.size() > 0 ? 3 : 1)];
                    System.arraycopy(expectedRequired, 0, expected, 0, nRequired);
                    if (optionalCovariates.size() > 0) {
                        expected[expected.length-3] = expectedCovariate[cov];
                        // NOTE(review): cov++ assumes hashKeys come back in the same order as the
                        // optional covariates were registered — TODO confirm against BQSRKeyManager.
                        expected[expected.length-2] = optionalCovariates.get(cov++).getClass().getSimpleName().split("Covariate")[0];
                    }
                    expected[expected.length-1] = EventType.eventFrom(eventType);

//                    System.out.println("Actual  : " + Utils.join(",", Arrays.asList(actual)));
//                    System.out.println("Expected: " + Utils.join(",", Arrays.asList(expected)));
//                    System.out.println();

                    for (int k = 0; k < expected.length; k++)
                        Assert.assertEquals(actual[k], expected[k]);
                }
            }
        }
    }
}

View File

@ -33,19 +33,15 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

1518
public/testdata/exampleGRP.grp vendored 100644

File diff suppressed because it is too large Load Diff