Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-03-27 15:19:46 -04:00
commit 74615c42df
15 changed files with 2814 additions and 428 deletions

View File

@ -1,4 +1,4 @@
Copyright (c) 2011 The Broad Institute Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation obtaining a copy of this software and associated documentation

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*; import java.io.*;
import java.util.Collection; import java.util.Collection;
import java.util.List;
import java.util.TreeMap; import java.util.TreeMap;
/** /**
@ -141,6 +142,11 @@ public class GATKReport {
tables.put(table.getTableName(), table); tables.put(table.getTableName(), table);
} }
public void addTables(List<GATKReportTable> gatkReportTables) {
for (GATKReportTable table : gatkReportTables)
addTable(table);
}
/** /**
* Return true if table with a given name exists * Return true if table with a given name exists
* *

View File

@ -26,17 +26,13 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.Gatherer; import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* User: carneiro * User: carneiro
@ -45,80 +41,31 @@ import java.util.Map;
public class BQSRGatherer extends Gatherer { public class BQSRGatherer extends Gatherer {
///////////////////////////// private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
// Private Member Variables private static final String MISSING_OUTPUT_FILE = "missing output file name";
/////////////////////////////
private static final String EOF_MARKER = "EOF";
private HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
private void addCSVData (String line) {
String[] covariates = line.split(",");
String key = "";
RecalDatumOptimized values;
for (int i = 0; i < covariates.length-3; i++)
key += covariates[i] + ",";
if (covariates.length < 3)
throw new ReviewedStingException("Line only has 1 covariate : " + line);
values = new RecalDatumOptimized(Long.parseLong(covariates[covariates.length - 3]), Long.parseLong(covariates[covariates.length - 2]));
RecalDatumOptimized currentValues = dataMap.get(key);
if (currentValues == null)
dataMap.put(key, values);
else
currentValues.increment(values);
}
@Override @Override
public void gather(List<File> inputs, File output) { public void gather(List<File> inputs, File output) {
PrintStream o; RecalibrationReport generalReport = null;
PrintStream outputFile;
try { try {
o = new PrintStream(output); outputFile = new PrintStream(output);
} catch ( FileNotFoundException e) { } catch(FileNotFoundException e) {
throw new UserException("File to be output by CountCovariates Gather function was not found"); throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
} }
boolean sawEOF = false; for (File input : inputs) {
boolean printedHeader = false; RecalibrationReport inputReport = new RecalibrationReport(input);
if (generalReport == null)
// Read input files generalReport = inputReport;
for ( File RECAL_FILE : inputs) { else
try { generalReport.combine(inputReport);
for ( String line : new XReadLines(RECAL_FILE) ) {
if ( EOF_MARKER.equals(line) ) {
sawEOF = true; // sanity check
break;
}
else if(line.startsWith("#")) {
if (!printedHeader)
o.println(line);
}
else // Found a line of data
addCSVData(line); // Parse the line and add the data to the HashMap
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
}
if ( !sawEOF ) {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
}
printedHeader = true;
} }
if (generalReport == null)
throw new ReviewedStingException(EMPTY_INPUT_LIST);
// Write output file from dataMap generalReport.calculateEmpiricalAndQuantizedQualities();
for(Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet()) generalReport.output(outputFile);
o.println(entry.getKey() + entry.getValue().outputToCSV());
o.println("EOF");
} }
} }

View File

@ -25,17 +25,17 @@ import java.util.*;
* @since 3/6/12 * @since 3/6/12
*/ */
public class BQSRKeyManager { public class BQSRKeyManager {
private List<RequiredCovariateInfo> requiredCovariates; private final List<RequiredCovariateInfo> requiredCovariates;
private List<OptionalCovariateInfo> optionalCovariates; private final List<OptionalCovariateInfo> optionalCovariates;
private Map<String, Short> covariateNameToIDMap; private final Map<String, Short> covariateNameToIDMap;
private int nRequiredBits; // Number of bits used to represent the required covariates private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites private int nOptionalBits; // Number of bits used to represent the standard covaraites
private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
private int totalNumberOfBits; // Sum of all of the above plus the event bits private final int totalNumberOfBits; // Sum of all of the above plus the event bits
private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
/** /**
* Initializes the KeyManager with the total number of covariates to use * Initializes the KeyManager with the total number of covariates to use
@ -44,34 +44,34 @@ public class BQSRKeyManager {
* @param optionalCovariates the ordered list of optional covariates * @param optionalCovariates the ordered list of optional covariates
*/ */
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) { public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) this.covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
nRequiredBits = 0; nRequiredBits = 0;
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
int nBits = required.numberOfBits(); // number of bits used by this covariate int nBits = required.numberOfBits(); // number of bits used by this covariate
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
nRequiredBits += nBits; nRequiredBits += nBits;
} }
short id = 0; short id = 0;
nOptionalBits = 0; nOptionalBits = 0;
for (Covariate optional : optionalCovariates) { for (Covariate optional : optionalCovariates) {
int nBits = optional.numberOfBits(); // number of bits used by this covariate int nBits = optional.numberOfBits(); // number of bits used by this covariate
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
this.covariateNameToIDMap.put(covariateName, id); this.covariateNameToIDMap.put(covariateName, id);
id++; id++;
} }
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
} }
/** /**
@ -93,32 +93,32 @@ public class BQSRKeyManager {
* @return one key in bitset representation per covariate * @return one key in bitset representation per covariate
*/ */
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
int covariateIndex = 0; int covariateIndex = 0;
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
for (RequiredCovariateInfo infoRequired : requiredCovariates) for (RequiredCovariateInfo infoRequired : requiredCovariates)
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
for (OptionalCovariateInfo infoOptional : optionalCovariates) { for (OptionalCovariateInfo infoOptional : optionalCovariates) {
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
if (covariateKey == null) if (covariateKey == null)
continue; // do not add nulls to the final set of keys. continue; // do not add nulls to the final set of keys.
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
optionalKey.or(requiredKey); // import all the required covariates optionalKey.or(requiredKey); // import all the required covariates
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(optionalKey); // add this key to the list of keys allBitSets.add(optionalKey); // add this key to the list of keys
} }
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(requiredKey); // add this key to the list of keys allBitSets.add(requiredKey); // add this key to the list of keys
} }
return allBitSets; return allBitSets;
@ -141,25 +141,25 @@ public class BQSRKeyManager {
int requiredCovariate = 0; int requiredCovariate = 0;
for (RequiredCovariateInfo infoRequired : requiredCovariates) { for (RequiredCovariateInfo infoRequired : requiredCovariates) {
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
} }
if (optionalCovariates.size() > 0) { if (optionalCovariates.size() > 0) {
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
} }
int eventIndex = key.length - 1; // the event type is always the last key int eventIndex = key.length - 1; // the event type is always the last key
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
return bitSetKey; return bitSetKey;
} }
@ -186,19 +186,19 @@ public class BQSRKeyManager {
public List<Object> keySetFrom(BitSet key) { public List<Object> keySetFrom(BitSet key) {
List<Object> objectKeys = new ArrayList<Object>(); List<Object> objectKeys = new ArrayList<Object>();
for (RequiredCovariateInfo info : requiredCovariates) { for (RequiredCovariateInfo info : requiredCovariates) {
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
} }
if (optionalCovariates.size() > 0) { if (optionalCovariates.size() > 0) {
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is) BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is)
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
} }
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
return objectKeys; return objectKeys;
} }
@ -227,7 +227,7 @@ public class BQSRKeyManager {
private BitSet chopNBitsFrom(BitSet key, int n) { private BitSet chopNBitsFrom(BitSet key, int n) {
BitSet choppedKey = new BitSet(); BitSet choppedKey = new BitSet();
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1)) for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
return choppedKey; return choppedKey;
} }
@ -269,7 +269,7 @@ public class BQSRKeyManager {
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) { private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1)) for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
} }
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) { private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
@ -282,22 +282,20 @@ public class BQSRKeyManager {
* Aggregate information for each Covariate * Aggregate information for each Covariate
*/ */
class RequiredCovariateInfo { class RequiredCovariateInfo {
public int bitsBefore; // number of bits before this covariate in the combined bitset key public final int bitsBefore; // number of bits before this covariate in the combined bitset key
public int nBits; // number of bits used by this covariate (cached access to covariate.nBits()) public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) public final Covariate covariate; // this allows reverse lookup of the Covariates in order
public Covariate covariate; // this allows reverse lookup of the Covariates in order
RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) { RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) {
this.bitsBefore = bitsBefore; this.bitsBefore = bitsBefore;
this.nBits = nBits;
this.mask = mask; this.mask = mask;
this.covariate = covariate; this.covariate = covariate;
} }
} }
class OptionalCovariateInfo { class OptionalCovariateInfo {
public BitSet covariateID; // cache the covariate ID public final BitSet covariateID; // cache the covariate ID
public Covariate covariate; public final Covariate covariate;
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) { OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
this.covariateID = covariateID; this.covariateID = covariateID;

View File

@ -0,0 +1,74 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map;
/**
* Class that encapsulates the information necessary for quality score quantization for BQSR
*
* @author carneiro
* @since 3/26/12
*/
public class QuantizationInfo {
private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts;
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
this.quantizedQuals = quantizedQuals;
this.empiricalQualCounts = empiricalQualCounts;
}
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
Map<BitSet, RecalDatum> qualTable = null; // look for the quality score table
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covaraites
qualTable = entry.getValue();
}
if (qualTable == null)
throw new ReviewedStingException("Could not find QualityScore table.");
for (RecalDatum datum : qualTable.values()) {
int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
long nObservations = datum.numObservations;
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
}
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(nLevels);
}
public void quantizeQualityScores(int nLevels) {
QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
}
public List<Byte> getQuantizedQuals() {
return quantizedQuals;
}
public GATKReportTable generateReportTable() {
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, 0L);
quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, (byte) 0);
for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) {
quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
}
return quantizedTable;
}
}

View File

@ -28,7 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.samtools.SAMUtils; import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual; import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -42,6 +43,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.PrintStream;
import java.util.*; import java.util.*;
/** /**
@ -71,13 +73,14 @@ public class RecalDataManager {
public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullPlatform = false; private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE { public enum SOLID_RECAL_MODE {
/** /**
* Treat reference inserted bases as reference matching bases. Very unsafe! * Treat reference inserted bases as reference matching bases. Very unsafe!
@ -136,25 +139,38 @@ public class RecalDataManager {
} }
} }
public static void listAvailableCovariates(Logger logger) {
// Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
// Print and exit if that's what was requested /**
logger.info("Available covariates:"); * Initializes the recalibration table -> key manager map
for (Class<?> covClass : covariateClasses) *
logger.info(covClass.getSimpleName()); * @param requiredCovariates list of required covariates (in order)
logger.info(""); * @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and it's corresponding recalibration table properly initialized
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
}
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
} }
/** /**
* Generates two lists : required covariates and optional covariates based on the user's requests. * Generates two lists : required covariates and optional covariates based on the user's requests.
* *
* Performs the following tasks in order: * Performs the following tasks in order:
* 1. Adds all requierd covariates in order * 1. Adds all requierd covariates in order
* 2. Check if the user asked to use the standard covariates and adds them all if that's the case * 2. Check if the user asked to use the standard covariates and adds them all if that's the case
* 3. Adds all covariates requested by the user that were not already added by the two previous steps * 3. Adds all covariates requested by the user that were not already added by the two previous steps
* *
* @param argumentCollection the argument collection object for the recalibration walker * @param argumentCollection the argument collection object for the recalibration walker
* @return a pair of ordered lists : required covariates (first) and optional covariates (second) * @return a pair of ordered lists : required covariates (first) and optional covariates (second)
*/ */
@ -194,52 +210,102 @@ public class RecalDataManager {
return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates); return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates);
} }
/** public static void listAvailableCovariates(Logger logger) {
* Initializes the recalibration table -> key manager map // Get a list of all available covariates
* final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
* @param requiredCovariates list of required covariates (in order)
* @param optionalCovariates list of optional covariates (in order) // Print and exit if that's what was requested
* @return a map with each key manager and it's corresponding recalibration table properly initialized logger.info("Available covariates:");
*/ for (Class<?> covClass : covariateClasses)
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) { logger.info(covClass.getSimpleName());
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>(); logger.info("");
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
}
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
} }
/** public static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
* Initializes the table -> key manager map (unfortunate copy of the above code with minor modifications to accomodate the different return types (RecalDatum vs EmpiricalQual objects) List<GATKReportTable> result = new LinkedList<GATKReportTable>();
* int tableIndex = 0;
* @param requiredCovariates list of required covariates (in order) for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
* @param optionalCovariates list of optional covariates (in order) BQSRKeyManager keyManager = entry.getKey();
* @return a map with each key manager and it's corresponding recalibration table properly initialized Map<BitSet, RecalDatum> recalTable = entry.getValue();
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> initializeEmpiricalTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) { GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");
final LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>(); final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
for (Covariate covariate : requiredCovariates) { final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.2f");
requiredCovariatesToAdd.add(covariate); final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.2f");
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(QualityUtils.MAX_QUAL_SCORE); // initializing a new recal table for each required covariate (cumulatively) final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
long primaryKey = 0L;
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
for (Covariate covariate : requiredList) {
String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
}
if (optionalList.size() > 0) {
columnNames.add(covariateValue);
columnNames.add(covariateName);
}
columnNames.add(eventType); // the order of these column names is important here
columnNames.add(empiricalQuality);
columnNames.add(estimatedQReported);
columnNames.add(nObservations);
columnNames.add(nErrors);
reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
for (Pair<String, String> columnName : columnNames)
reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // every table must have the event type
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
BitSet bitSetKey = recalTableEntry.getKey();
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
Iterator<Pair<String, String>> iterator = columnNames.iterator();
for (Object key : keyManager.keySetFrom(bitSetKey)) {
String columnName = iterator.next().getFirst();
columnData.put(columnName, key);
}
RecalDatum datum = recalTableEntry.getValue();
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); // iterator.next() gives the column name for Empirical Quality
columnData.put(iterator.next().getFirst(), Math.round(datum.getEstimatedQReported())); // iterator.next() gives the column name for EstimatedQReported
columnData.put(iterator.next().getFirst(), datum.numObservations);
columnData.put(iterator.next().getFirst(), datum.numMismatches);
for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
String columnName = dataEntry.getKey();
Object value = dataEntry.getValue();
reportTable.set(primaryKey, columnName, value.toString());
}
primaryKey++;
}
result.add(reportTable);
} }
final Map<BitSet, EmpiricalQual> recalTable = new HashMap<BitSet, EmpiricalQual>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates return result;
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
} }
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
}
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
}
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
GATKReport report = new GATKReport();
report.addTable(argumentTable);
report.addTable(quantizationTable);
report.addTables(recalTables);
report.print(outputFile);
}
/** /**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string

View File

@ -38,6 +38,8 @@ public class RecalDatum extends RecalDatumOptimized {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// constructors // constructors
@ -75,7 +77,6 @@ public class RecalDatum extends RecalDatumOptimized {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment(other.numObservations, other.numMismatches); this.increment(other.numObservations, other.numMismatches);
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations); this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -84,8 +85,8 @@ public class RecalDatum extends RecalDatumOptimized {
// //
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) { public final void calcCombinedEmpiricalQuality(final int maxQual) {
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again this.empiricalQuality = empiricalQualDouble(SMOOTHING_CONSTANT, maxQual); // cache the value so we don't call log over and over again
} }
public final void calcEstimatedReportedQuality() { public final void calcEstimatedReportedQuality() {
@ -106,6 +107,11 @@ public class RecalDatum extends RecalDatumOptimized {
return empiricalQuality; return empiricalQuality;
} }
public final void resetCalculatedQualities() {
empiricalQuality = 0.0;
estimatedQReported = 0.0;
}
private double calcExpectedErrors() { private double calcExpectedErrors() {
return (double) this.numObservations * qualToErrorProb(estimatedQReported); return (double) this.numObservations * qualToErrorProb(estimatedQReported);
} }

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Collections; import java.util.Collections;
@ -156,5 +158,25 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16; public int QUANTIZING_LEVELS = 16;
public GATKReportTable generateReportTable() {
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
argumentsTable.addPrimaryKey("Argument");
argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, "null");
argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
argumentsTable.set("standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, USE_STANDARD_COVARIATES);
argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
argumentsTable.set("solid_recal_mode", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE);
argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE);
argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
argumentsTable.set("insertions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
argumentsTable.set("low_quality_tail", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
return argumentsTable;
}
} }

View File

@ -0,0 +1,290 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* This class has all the static functionality for reading a recalibration report file into memory.
*
* @author carneiro
* @since 3/26/12
*/
public class RecalibrationReport {
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter | todo -- this should be a new parameter, not necessarily coming from the original table parameter list
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
public RecalibrationReport(final File RECAL_FILE) {
GATKReport report = new GATKReport(RECAL_FILE);
argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
RAC = initializeArgumentCollectionTable(argumentTable);
GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
quantizationInfo = initializeQuantizationTable(quantizedTable);
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
table = parseReadGroupTable(keyManager, reportTable);
}
else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
table = parseQualityScoreTable(keyManager, reportTable);
}
else
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
}
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
final Map<BitSet, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
keysAndTablesMap.put(keyManager, table);
}
/**
* Combines two recalibration reports by adding all observations and errors
*
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate them
* after combining. The reason for not calculating it is because this function is inteded for combining a series of
* recalibration reports, and it only makes sense to calculate the empirical qualities and quantized qualities after all
* the recalibration reports have been combined. Having the user recalculate when appropriate, makes this method faster
*
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
*
* @param other the recalibration report to combine with this one
*/
public void combine(RecalibrationReport other) {
Iterator<Map<BitSet, RecalDatum>> tableIterator = keysAndTablesMap.values().iterator(); // because these are ordered (linked hashmaps) we can iterate over the 'this' and do a for loop on the 'other' tables and be sure that we are looking at the equivalent tables on both objects
for (Map<BitSet, RecalDatum> otherTable : other.getKeysAndTablesMap().values()) { // iterate over all tables for 'other'
Map<BitSet, RecalDatum> thisTable = tableIterator.next(); // iterate over all tables for 'this'
for (Map.Entry<BitSet, RecalDatum> entry : otherTable.entrySet()) { // for each table, go through all the entries in the 'other' dataset to update 'this' dataset
BitSet key = entry.getKey();
RecalDatum otherDatum = entry.getValue();
RecalDatum thisDatum = thisTable.get(key);
thisDatum.increment(otherDatum); // add the two datum objects into 'this'
thisDatum.resetCalculatedQualities(); // reset the empirical quality to make sure the user doesn't forget to recalculate it
}
}
}
public QuantizationInfo getQuantizationInfo() {
return quantizationInfo;
}
public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() {
return keysAndTablesMap;
}
public ArrayList<Covariate> getRequestedCovariates() {
return requestedCovariates;
}
/**
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
*
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
* Shared parsing functionality for all tables.
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);
for (Object primaryKey : reportTable.getPrimaryKeys()) {
int nKeys = columnNamesOrderedList.size();
Object [] keySet = new Object[nKeys];
for (int i = 0; i < nKeys; i++)
keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
BitSet bitKey = keyManager.bitSetFromKey(keySet);
long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
result.put(bitKey, recalDatum);
}
return result;
}
/**
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
*
* @param table the GATKReportTable containing the quantization mappings
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
*/
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
for (Object primaryKey : table.getPrimaryKeys()) {
Object quantizedObject = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
Object countObject = table.get(primaryKey, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
byte originalQual = Byte.parseByte(primaryKey.toString());
byte quantizedQual = Byte.parseByte(quantizedObject.toString());
long quantizedCount = Long.parseLong(countObject.toString());
quals[originalQual] = quantizedQual;
counts[originalQual] = quantizedCount;
}
return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts));
}
/**
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
*
* @param table the GATKReportTable containing the arguments and its corresponding values
* @return a RAC object properly initialized with all the objects in the table
*/
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
for (Object primaryKey : table.getPrimaryKeys()) {
Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
if (value.equals("null"))
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
if (primaryKey.equals("covariate") && value != null)
RAC.COVARIATES = value.toString().split(",");
else if (primaryKey.equals("standard_covs"))
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("solid_recal_mode"))
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
else if (primaryKey.equals("solid_nocall_strategy"))
RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
else if (primaryKey.equals("mismatches_context_size"))
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("insertions_context_size"))
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("deletions_context_size"))
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("mismatches_default_quality"))
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("insertions_default_quality"))
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("deletions_default_quality"))
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("low_quality_tail"))
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
else if (primaryKey.equals("default_platform"))
RAC.DEFAULT_PLATFORM = (String) value;
else if (primaryKey.equals("force_platform"))
RAC.FORCE_PLATFORM = (String) value;
else if (primaryKey.equals("quantizing_levels"))
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
}
return RAC;
}
/**
* this functionality avoids recalculating the empirical qualities, estimated reported quality
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
*/
public void calculateEmpiricalAndQuantizedQualities() {
quantizationInfo.quantizeQualityScores(RAC.QUANTIZING_LEVELS);
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) {
for (RecalDatum datum : table.values()) {
datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE);
datum.calcEstimatedReportedQuality();
}
}
}
public void output(PrintStream output) {
RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output);
}
}

View File

@ -25,13 +25,9 @@
package org.broadinstitute.sting.utils.recalibration; package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.bqsr.*; import org.broadinstitute.sting.gatk.walkers.bqsr.*;
import org.broadinstitute.sting.gatk.walkers.recalibration.EmpiricalQual;
import org.broadinstitute.sting.utils.BitSetUtils; import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -46,224 +42,27 @@ import java.util.*;
*/ */
public class BaseRecalibration { public class BaseRecalibration {
private List<Byte> qualQuantizationMap; // histogram containing the map for qual quantization (calculated after recalibration is done) private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>> keysAndTablesMap; // quick access reference to the read group table and its key manager private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code"; private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here"; private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/**
* Should ALWAYS use the constructor with the GATK Report file
*/
private BaseRecalibration() {}
/** /**
* Constructor using a GATK Report file * Constructor using a GATK Report file
* *
* @param RECAL_FILE a GATK Report file containing the recalibration information * @param RECAL_FILE a GATK Report file containing the recalibration information
*/ */
public BaseRecalibration(final File RECAL_FILE) { public BaseRecalibration(final File RECAL_FILE) {
GATKReport report = new GATKReport(RECAL_FILE); RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
GATKReportTable argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE); quantizationInfo = recalibrationReport.getQuantizationInfo();
RecalibrationArgumentCollection RAC = initializeArgumentCollectionTable(argumentTable); keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
requestedCovariates = recalibrationReport.getRequestedCovariates();
GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE);
qualQuantizationMap = initializeQuantizationTable(quantizedTable);
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, EmpiricalQual>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, EmpiricalQual> table; // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
table = parseReadGroupTable(keyManager, reportTable);
}
else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table
final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE);
table = parseQualityScoreTable(keyManager, reportTable);
}
else
throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION);
keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map
}
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
final Map<BitSet, EmpiricalQual> table = parseAllCovariatesTable(keyManager, reportTable);
keysAndTablesMap.put(keyManager, table); // adding the pair table+key to the map
} }
/**
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, EmpiricalQual> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
*
* Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, EmpiricalQual> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
* Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, EmpiricalQual> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
}
/**
* Shared parsing functionality for all tables.
*
* @param keyManager the key manager for this table
* @param reportTable the GATKReport table containing data for this table
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, EmpiricalQual> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
Map<BitSet, EmpiricalQual> result = new HashMap<BitSet, EmpiricalQual>(reportTable.getNumRows()*2);
for (Object primaryKey : reportTable.getPrimaryKeys()) {
int nKeys = columnNamesOrderedList.size();
Object [] keySet = new Object[nKeys];
for (int i = 0; i < nKeys; i++)
keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
BitSet bitKey = keyManager.bitSetFromKey(keySet);
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
EmpiricalQual empiricalQual = new EmpiricalQual(estimatedQReported, empiricalQuality);
result.put(bitKey, empiricalQual);
}
return result;
}
/**
* Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores
*
* @param table the GATKReportTable containing the quantization mappings
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
*/
private List<Byte> initializeQuantizationTable(GATKReportTable table) {
Byte[] result = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
for (Object primaryKey : table.getPrimaryKeys()) {
Object value = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
byte originalQual = Byte.parseByte(primaryKey.toString());
byte quantizedQual = Byte.parseByte(value.toString());
result[originalQual] = quantizedQual;
}
return Arrays.asList(result);
}
/**
* Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values
*
* @param table the GATKReportTable containing the arguments and its corresponding values
* @return a RAC object properly initialized with all the objects in the table
*/
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
for (Object primaryKey : table.getPrimaryKeys()) {
Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
if (value.equals("null"))
value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
if (primaryKey.equals("covariate") && value != null)
RAC.COVARIATES = value.toString().split(",");
else if (primaryKey.equals("standard_covs"))
RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("solid_recal_mode"))
RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
else if (primaryKey.equals("solid_nocall_strategy"))
RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
else if (primaryKey.equals("mismatches_context_size"))
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("insertions_context_size"))
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("deletions_context_size"))
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (primaryKey.equals("mismatches_default_quality"))
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("insertions_default_quality"))
RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("deletions_default_quality"))
RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
else if (primaryKey.equals("low_quality_tail"))
RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value);
else if (primaryKey.equals("default_platform"))
RAC.DEFAULT_PLATFORM = (String) value;
else if (primaryKey.equals("force_platform"))
RAC.FORCE_PLATFORM = (String) value;
else if (primaryKey.equals("quantizing_levels"))
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
}
return RAC;
}
/** /**
* Recalibrates the base qualities of a read * Recalibrates the base qualities of a read
* *
@ -316,9 +115,9 @@ public class BaseRecalibration {
double deltaQReported = 0.0; double deltaQReported = 0.0;
double deltaQCovariates = 0.0; double deltaQCovariates = 0.0;
for (Map.Entry<BQSRKeyManager, Map<BitSet, EmpiricalQual>> mapEntry : keysAndTablesMap.entrySet()) { for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> mapEntry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = mapEntry.getKey(); BQSRKeyManager keyManager = mapEntry.getKey();
Map<BitSet, EmpiricalQual> table = mapEntry.getValue(); Map<BitSet, RecalDatum> table = mapEntry.getValue();
switch(keyManager.getRequiredCovariates().size()) { switch(keyManager.getRequiredCovariates().size()) {
case 1: // this is the ReadGroup table case 1: // this is the ReadGroup table
@ -326,7 +125,7 @@ public class BaseRecalibration {
if (bitKeys.size() > 1) if (bitKeys.size() > 1)
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION); throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
final EmpiricalQual empiricalQualRG = table.get(bitKeys.get(0)); final RecalDatum empiricalQualRG = table.get(bitKeys.get(0));
if (empiricalQualRG != null) { if (empiricalQualRG != null) {
final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality();
final double aggregrateQReported = empiricalQualRG.getEstimatedQReported(); final double aggregrateQReported = empiricalQualRG.getEstimatedQReported();
@ -339,7 +138,7 @@ public class BaseRecalibration {
if (bitKeys.size() > 1) if (bitKeys.size() > 1)
throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION); throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION);
final EmpiricalQual empiricalQualQS = table.get(bitKeys.get(0)); final RecalDatum empiricalQualQS = table.get(bitKeys.get(0));
if (empiricalQualQS != null) { if (empiricalQualQS != null) {
final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
@ -348,7 +147,7 @@ public class BaseRecalibration {
else { // this is the table with all the covariates else { // this is the table with all the covariates
bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn
for (BitSet k : bitKeys) { for (BitSet k : bitKeys) {
final EmpiricalQual empiricalQualCO = table.get(k); final RecalDatum empiricalQualCO = table.get(k);
if (empiricalQualCO != null) { if (empiricalQualCO != null) {
double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality();
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
@ -364,7 +163,8 @@ public class BaseRecalibration {
double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula
recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_QUAL_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL
return qualQuantizationMap.get((int) recalibratedQual); // return the quantized version of the recalibrated quality
return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality
} }
} }

View File

@ -0,0 +1,476 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.PrintStream;
import java.util.*;
/**
* A general algorithm for quantizing quality score distributions to use a specific number of levels
*
* Takes a histogram of quality scores and a desired number of levels and produces a
* map from original quality scores -> quantized quality scores.
*
* Note that this data structure is fairly heavy-weight, holding lots of debugging and
* calculation information. If you want to use it efficiently at scale with lots of
* read groups the right way to do this:
*
* Map<ReadGroup, List<Byte>> map
* for each read group rg:
* hist = getQualHist(rg)
* QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual)
* map.set(rg, qq.getOriginalToQuantizedMap())
*
* This map would then be used to look up the appropriate original -> quantized
* quals for each read as it comes in.
*
* @author Mark Depristo
* @since 3/2/12
*/
public class QualQuantizer {
final private static Set<QualInterval> MY_EMPTY_SET = Collections.emptySet();
private static Logger logger = Logger.getLogger(QualQuantizer.class);
/**
* Inputs to the QualQuantizer
*/
final int nLevels, minInterestingQual;
final List<Long> nObservationsPerQual;
/**
* Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28).
*
* Has the same range as nObservationsPerQual
*/
final List<Byte> originalToQuantizedMap;
/** Sorted set of qual intervals.
*
* After quantize() this data structure contains only the top-level qual intervals
*/
final TreeSet<QualInterval> quantizedIntervals;
/**
* Protected creator for testng use only
*/
protected QualQuantizer(final int minInterestingQual) {
this.nObservationsPerQual = Collections.emptyList();
this.nLevels = 0;
this.minInterestingQual = minInterestingQual;
this.quantizedIntervals = null;
this.originalToQuantizedMap = null;
}
/**
* Creates a QualQuantizer for the histogram that has nLevels
*
* Note this is the only interface to the system. After creating this object
* the map can be obtained via getOriginalToQuantizedMap()
*
* @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that
* this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the
* way up to the largest quality score possible in the reads. OK if the histogram includes many 0
* count bins, as these are quantized for free.
* @param nLevels the desired number of distinct quality scores to represent the full original range. Must
* be at least 1.
* @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely
* merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and
* quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing
* all data with Q0-Q10.
*/
public QualQuantizer(final List<Long> nObservationsPerQual, final int nLevels, final int minInterestingQual) {
this.nObservationsPerQual = nObservationsPerQual;
this.nLevels = nLevels;
this.minInterestingQual = minInterestingQual;
// some sanity checking
if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedStingException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual));
if ( nLevels < 0 ) throw new ReviewedStingException("nLevels must be >= 0");
if ( minInterestingQual < 0 ) throw new ReviewedStingException("minInterestingQual must be >= 0");
// actually run the quantizer
this.quantizedIntervals = quantize();
// store the map
this.originalToQuantizedMap = intervalsToMap(quantizedIntervals);
}
/**
* Represents an contiguous interval of quality scores.
*
* qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2
*/
@Invariant({
"qStart <= qEnd",
"qStart >= 0",
"qEnd <= 1000",
"nObservations >= 0",
"nErrors >= 0",
"nErrors <= nObservations",
"fixedQual >= -1 && fixedQual <= QualityUtils.MAX_QUAL_SCORE",
"mergeOrder >= 0"})
protected final class QualInterval implements Comparable<QualInterval> {
final int qStart, qEnd, fixedQual, level;
final long nObservations, nErrors;
final Set<QualInterval> subIntervals;
/** for debugging / visualization. When was this interval created? */
int mergeOrder;
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) {
this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET);
}
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set<QualInterval> subIntervals) {
this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals);
}
protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) {
this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET);
}
@Requires("level >= 0")
public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set<QualInterval> subIntervals) {
this.qStart = qStart;
this.qEnd = qEnd;
this.nObservations = nObservations;
this.nErrors = nErrors;
this.fixedQual = fixedQual;
this.level = level;
this.mergeOrder = 0;
this.subIntervals = Collections.unmodifiableSet(subIntervals);
}
/**
* Human readable name of this interval: e.g., 10-12
* @return
*/
public String getName() {
return qStart + "-" + qEnd;
}
@Override
public String toString() {
return "QQ:" + getName();
}
/**
* Returns the error rate (in real space) of this interval, or 0 if there are no obserations
* @return
*/
@Ensures("result >= 0.0")
public double getErrorRate() {
if ( hasFixedQual() )
return QualityUtils.qualToErrorProb((byte)fixedQual);
else if ( nObservations == 0 )
return 0.0;
else
return (nErrors+1) / (1.0 * (nObservations+1));
}
/**
* Returns the QUAL of the error rate of this interval, or the fixed
* qual if this interval was created with a fixed qual.
* @return
*/
@Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
public byte getQual() {
if ( ! hasFixedQual() )
return QualityUtils.probToQual(1-getErrorRate(), 0);
else
return (byte)fixedQual;
}
/**
* @return true if this bin is using a fixed qual
*/
public boolean hasFixedQual() {
return fixedQual != -1;
}
@Override
public int compareTo(final QualInterval qualInterval) {
return new Integer(this.qStart).compareTo(qualInterval.qStart);
}
/**
* Create a interval representing the merge of this interval and toMerge
*
* Errors and observations are combined
* Subintervals updated in order of left to right (determined by qStart)
* Level is 1 + highest level of this and toMerge
* Order must be updated elsewhere
*
* @param toMerge
* @return newly created merged QualInterval
*/
@Requires({"toMerge != null"})
@Ensures({
"result != null",
"result.nObservations >= this.nObservations",
"result.nObservations >= toMerge.nObservations",
"result.nErrors >= this.nErrors",
"result.nErrors >= toMerge.nErrors",
"result.qStart == Math.min(this.qStart, toMerge.qStart)",
"result.qEnd == Math.max(this.qEnd, toMerge.qEnd)",
"result.level > Math.max(this.level, toMerge.level)",
"result.subIntervals.size() == 2"
})
public QualInterval merge(final QualInterval toMerge) {
final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge;
final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;
if ( left.qEnd + 1 != right.qStart )
throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right);
final long nCombinedObs = left.nObservations + right.nObservations;
final long nCombinedErr = left.nErrors + right.nErrors;
final int level = Math.max(left.level, right.level) + 1;
final Set<QualInterval> subIntervals = new HashSet<QualInterval>(Arrays.asList(left, right));
QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals);
return merged;
}
public double getPenalty() {
return calcPenalty(getErrorRate());
}
/**
* Calculate the penalty of this interval, given the overall error rate for the interval
*
* If the globalErrorRate is e, this value is:
*
* sum_i |log10(e_i) - log10(e)| * nObservations_i
*
* each the index i applies to all leaves of the tree accessible from this interval
* (found recursively from subIntervals as necessary)
*
* @param globalErrorRate overall error rate in real space against which we calculate the penalty
* @return the cost of approximating the bins in this interval with the globalErrorRate
*/
@Requires("globalErrorRate >= 0.0")
@Ensures("result >= 0.0")
private double calcPenalty(final double globalErrorRate) {
if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
return 0.0;
if ( subIntervals.isEmpty() ) {
// this is leave node
if ( this.qEnd <= minInterestingQual )
// It's free to merge up quality scores below the smallest interesting one
return 0;
else {
return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations;
}
} else {
double sum = 0;
for ( final QualInterval interval : subIntervals )
sum += interval.calcPenalty(globalErrorRate);
return sum;
}
}
}
/**
* Main method for computing the quantization intervals.
*
* Invoked in the constructor after all input variables are initialized. Walks
* over the inputs and builds the min. penalty forest of intervals with exactly nLevel
* root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed
* to find the optimal combination.
*
* TODO: develop a smarter algorithm
*
* @return the forest of intervals with size == nLevels
*/
@Ensures({"! result.isEmpty()", "result.size() == nLevels"})
private TreeSet<QualInterval> quantize() {
// create intervals for each qual individually
final TreeSet<QualInterval> intervals = new TreeSet<QualInterval>();
for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) {
final long nObs = nObservationsPerQual.get(qStart);
final double errorRate = QualityUtils.qualToErrorProb((byte)qStart);
final double nErrors = nObs * errorRate;
final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart);
intervals.add(qi);
}
// greedy algorithm:
// while ( n intervals >= nLevels ):
// find intervals to merge with least penalty
// merge it
while ( intervals.size() > nLevels ) {
mergeLowestPenaltyIntervals(intervals);
}
return intervals;
}
/**
* Helper function that finds and mergest together the lowest penalty pair
* of intervals
* @param intervals
*/
@Requires("! intervals.isEmpty()")
private void mergeLowestPenaltyIntervals(final TreeSet<QualInterval> intervals) {
// setup the iterators
final Iterator<QualInterval> it1 = intervals.iterator();
final Iterator<QualInterval> it1p = intervals.iterator();
it1p.next(); // skip one
// walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty
QualInterval minMerge = null;
if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size());
int lastMergeOrder = 0;
while ( it1p.hasNext() ) {
final QualInterval left = it1.next();
final QualInterval right = it1p.next();
final QualInterval merged = left.merge(right);
lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder);
if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) {
if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge);
minMerge = merged;
}
}
// now actually go ahead and merge the minMerge pair
if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge);
intervals.removeAll(minMerge.subIntervals);
intervals.add(minMerge);
minMerge.mergeOrder = lastMergeOrder + 1;
if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals);
}
/**
* Given a final forest of intervals constructs a list mapping
* list.get(i) => quantized qual to use for original quality score i
*
* This function should be called only once to initialize the corresponding
* cached value in this object, as the calculation is a bit costly.
*
* @param intervals
* @return
*/
@Ensures("result.size() == getNQualsInHistogram()")
private List<Byte> intervalsToMap(final TreeSet<QualInterval> intervals) {
final List<Byte> map = new ArrayList<Byte>(getNQualsInHistogram());
map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE));
for ( final QualInterval interval : intervals ) {
for ( int q = interval.qStart; q <= interval.qEnd; q++ ) {
map.set(q, interval.getQual());
}
}
if ( Collections.min(map) == Byte.MIN_VALUE )
throw new ReviewedStingException("quantized quality score map contains an un-initialized value");
return map;
}
@Ensures("result > 0")
private final int getNQualsInHistogram() {
return nObservationsPerQual.size();
}
/**
* Write out a GATKReport to visualize the QualQuantization process of this data
* @param out
*/
public void writeReport(PrintStream out) {
final GATKReport report = new GATKReport();
addQualHistogramToReport(report);
addIntervalsToReport(report);
report.print(out);
}
private final void addQualHistogramToReport(final GATKReport report) {
report.addTable("QualHistogram", "Quality score histogram provided to report");
GATKReportTable table = report.getTable("QualHistogram");
table.addPrimaryKey("qual");
table.addColumn("count", "NA");
for ( int q = 0; q < nObservationsPerQual.size(); q++ ) {
table.set(q, "count", nObservationsPerQual.get(q));
}
}
private final void addIntervalsToReport(final GATKReport report) {
report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals");
GATKReportTable table = report.getTable("QualQuantizerIntervals");
table.addPrimaryKey("name");
table.addColumn("qStart", "NA");
table.addColumn("qEnd", "NA");
table.addColumn("level", "NA");
table.addColumn("merge.order", "NA");
table.addColumn("nErrors", "NA");
table.addColumn("nObservations", "NA");
table.addColumn("qual", "NA");
table.addColumn("penalty", "NA");
table.addColumn("root.node", "NA");
//table.addColumn("subintervals", "NA");
for ( QualInterval interval : quantizedIntervals)
addIntervalToReport(table, interval, true);
}
private final void addIntervalToReport(final GATKReportTable table, QualInterval interval, final boolean atRootP) {
final String name = interval.getName();
table.set(name, "qStart", interval.qStart);
table.set(name, "qEnd", interval.qEnd);
table.set(name, "level", interval.level);
table.set(name, "merge.order", interval.mergeOrder);
table.set(name, "nErrors", interval.nErrors);
table.set(name, "nObservations", interval.nObservations);
table.set(name, "qual", interval.getQual());
table.set(name, "penalty", String.format("%.1f", interval.getPenalty()));
table.set(name, "root.node", atRootP);
for ( final QualInterval sub : interval.subIntervals )
addIntervalToReport(table, sub, false);
}
public List<Byte> getOriginalToQuantizedMap() {
return originalToQuantizedMap;
}
}

View File

@ -1,5 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.testng.Assert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.File; import java.io.File;
@ -13,17 +16,69 @@ import java.util.List;
public class BQSRGathererUnitTest { public class BQSRGathererUnitTest {
RecalibrationArgumentCollection RAC; RecalibrationArgumentCollection RAC;
private static File recal1 = new File("public/testdata/exampleCSV.csv"); private static File recal = new File("public/testdata/exampleGRP.grp");
private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
//todo -- this test doesnt work because the primary keys in different tables are not the same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure
@Test(enabled = false) @Test(enabled = false)
public void testCombineTwoFiles() { public void testCombineSimilarFiles() {
BQSRGatherer gatherer = new BQSRGatherer(); BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> (); List<File> recalFiles = new LinkedList<File> ();
File output = new File("foo.csv"); File output = new File("foo.grp");
recalFiles.add(recal);
recalFiles.add(recal1); recalFiles.add(recal);
recalFiles.add(recal2);
gatherer.gather(recalFiles, output); gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal);
GATKReport calculatedReport = new GATKReport(output);
for (GATKReportTable originalTable : originalReport.getTables()) {
GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName());
List<String> columnsToTest = new LinkedList<String>();
if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL
columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1);
}
else if (originalTable.getTableName().equals(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE)) {
columnsToTest.add(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
else if (originalTable.getTableName().startsWith("RecalTable")) {
columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2);
}
}
}
/**
* Common testing functionality given the columns to test and the multiplication factor to the expected result
*
* @param original the original table
* @param calculated the calculated table
* @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
* @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated
*/
private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest, int factor) {
for (Object primaryKey : original.getPrimaryKeys()) { // tables don't necessarily have the same primary keys
for (String column : columnsToTest) {
Object actual = calculated.get(primaryKey, column);
Object expected = original.get(primaryKey, column);
if (factor != 1) {
if (expected instanceof Double)
expected = (Double) expected * factor;
else if (expected instanceof Long)
expected = (Long) expected * factor;
else if (expected instanceof Integer)
expected = (Integer) expected * factor;
else if (expected instanceof Byte) {
expected = (Byte) expected * factor;
}
}
Assert.assertEquals(actual, expected, "Primary key: " + primaryKey + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName());
}
}
} }
} }

View File

@ -0,0 +1,132 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.LinkedList;
import java.util.List;
/**
* @author Mauricio Carneiro
* @since 3/7/12
*/
public class BQSRKeyManagerUnitTest {
RecalibrationArgumentCollection RAC;
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
}
@Test(enabled = true)
public void testCombineBitSets() {
final int nRequired = 2;
final ArrayList<Covariate> covariates = new ArrayList<Covariate>();
covariates.add(new ReadGroupCovariate());
covariates.add(new QualityScoreCovariate());
covariates.add(new CycleCovariate());
covariates.add(new ContextCovariate());
createReadAndTest(covariates, nRequired);
}
@Test(enabled = true)
public void testOnlyRequiredCovariates() {
final int nRequired = 2;
final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
covariates.add(new ReadGroupCovariate());
covariates.add(new QualityScoreCovariate());
createReadAndTest(covariates, nRequired);
}
@Test(enabled = true)
public void testOnlyOneCovariate() {
final int nRequired = 1;
final ArrayList<Covariate> covariates = new ArrayList<Covariate>(2);
covariates.add(new ReadGroupCovariate());
createReadAndTest(covariates, nRequired);
}
private void createReadAndTest(List<Covariate> covariates, int nRequired) {
int readLength = 1000;
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M");
read.setReadGroup(new GATKSAMReadGroupRecord("ID"));
read.getReadGroup().setPlatform("illumina");
runTestOnRead(read, covariates, nRequired);
read.setReadNegativeStrandFlag(true);
runTestOnRead(read, covariates, nRequired);
read.setReadPairedFlag(true);
read.setSecondOfPairFlag(true);
runTestOnRead(read, covariates, nRequired);
read.setReadNegativeStrandFlag(false);
runTestOnRead(read, covariates, nRequired);
}
private void runTestOnRead(GATKSAMRecord read, List<Covariate> covariateList, int nRequired) {
final BitSet[][][] covariateKeys = new BitSet[covariateList.size()][EventType.values().length][];
int i = 0;
for (Covariate cov : covariateList) {
cov.initialize(RAC);
CovariateValues covValues = cov.getValues(read);
covariateKeys[i][EventType.BASE_SUBSTITUTION.index] = covValues.getMismatches();
covariateKeys[i][EventType.BASE_INSERTION.index] = covValues.getInsertions();
covariateKeys[i][EventType.BASE_DELETION.index] = covValues.getDeletions();
i++;
}
List<Covariate> requiredCovariates = new LinkedList<Covariate>();
List<Covariate> optionalCovariates = new LinkedList<Covariate>();
for (int j=0; j<nRequired; j++)
requiredCovariates.add(covariateList.get(j));
for (int j=nRequired; j<covariateList.size(); j++)
optionalCovariates.add(covariateList.get(j));
BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
for (int l = 0; l < read.getReadLength(); l++) {
for (int eventType = 0; eventType < EventType.values().length; eventType++) {
BitSet[] keySet = new BitSet[covariateList.size()];
Object[] expectedRequired = new Object[covariateList.size()];
Object[] expectedCovariate = new Object[covariateList.size()];
for (int j = 0; j < covariateList.size(); j++) {
keySet[j] = covariateKeys[j][eventType][l];
if (j < nRequired)
expectedRequired[j] = covariateList.get(j).keyFromBitSet(keySet[j]);
else
expectedCovariate[j - nRequired] = covariateList.get(j).keyFromBitSet(keySet[j]);
}
List<BitSet> hashKeys = keyManager.bitSetsFromAllKeys(keySet, EventType.eventFrom(eventType));
short cov = 0;
for (BitSet key : hashKeys) {
Object[] actual = keyManager.keySetFrom(key).toArray();
// Build the expected array
Object[] expected = new Object[nRequired + (optionalCovariates.size() > 0 ? 3 : 1)];
System.arraycopy(expectedRequired, 0, expected, 0, nRequired);
if (optionalCovariates.size() > 0) {
expected[expected.length-3] = expectedCovariate[cov];
expected[expected.length-2] = optionalCovariates.get(cov++).getClass().getSimpleName().split("Covariate")[0];
}
expected[expected.length-1] = EventType.eventFrom(eventType);
// System.out.println("Actual : " + Utils.join(",", Arrays.asList(actual)));
// System.out.println("Expected: " + Utils.join(",", Arrays.asList(expected)));
// System.out.println();
for (int k = 0; k < expected.length; k++)
Assert.assertEquals(actual[k], expected[k]);
}
}
}
}
}

View File

@ -33,19 +33,15 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.lang.reflect.Array;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;

1518
public/testdata/exampleGRP.grp vendored 100644

File diff suppressed because it is too large Load Diff