BQSR triage and test routines
* updated BQSR queue script for faster turnaround * implemented plot generation for scatter/gatherered runs * adjusted output file names to be cooperative with the queue script * added the recalibration report file to the argument table in the report * added ReadCovariates unit test -- guarantees that all the covariates are being generated for every base in the read * added RecalibrationReport unit test -- guarantees the integrity of the delta tables
This commit is contained in:
parent
a733723439
commit
e39a59594a
|
|
@ -65,6 +65,14 @@ public class BQSRGatherer extends Gatherer {
|
|||
if (generalReport == null)
|
||||
throw new ReviewedStingException(EMPTY_INPUT_LIST);
|
||||
|
||||
RecalibrationArgumentCollection RAC = generalReport.getRAC();
|
||||
if (RAC.recalibrationReport != null) {
|
||||
RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport);
|
||||
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
|
||||
}
|
||||
else
|
||||
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
|
||||
|
||||
generalReport.calculateEmpiricalAndQuantizedQualities();
|
||||
generalReport.output(outputFile);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -277,7 +277,42 @@ public class BQSRKeyManager {
|
|||
bitSet.and(mask);
|
||||
return chopNBitsFrom(bitSet, leadingBits);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof BQSRKeyManager))
|
||||
return false;
|
||||
|
||||
BQSRKeyManager other = (BQSRKeyManager) o;
|
||||
if (this == other)
|
||||
return true;
|
||||
|
||||
if (requiredCovariates.size() != other.requiredCovariates.size() || optionalCovariates.size() != other.optionalCovariates.size())
|
||||
return false;
|
||||
|
||||
Iterator<RequiredCovariateInfo> otherRequiredIterator = other.requiredCovariates.iterator();
|
||||
for (RequiredCovariateInfo thisInfo: requiredCovariates) {
|
||||
RequiredCovariateInfo otherInfo = otherRequiredIterator.next();
|
||||
|
||||
String thisName = thisInfo.covariate.getClass().getSimpleName();
|
||||
String otherName = otherInfo.covariate.getClass().getSimpleName();
|
||||
if (!thisName.equals(otherName))
|
||||
return false;
|
||||
}
|
||||
|
||||
Iterator<OptionalCovariateInfo> otherOptionalIterator = other.optionalCovariates.iterator();
|
||||
for (OptionalCovariateInfo thisInfo : optionalCovariates) {
|
||||
OptionalCovariateInfo otherInfo = otherOptionalIterator.next();
|
||||
String thisName = thisInfo.covariate.getClass().getSimpleName();
|
||||
String otherName = otherInfo.covariate.getClass().getSimpleName();
|
||||
if (!thisName.equals(otherName))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Aggregate information for each Covariate
|
||||
*/
|
||||
|
|
@ -292,7 +327,7 @@ public class BQSRKeyManager {
|
|||
this.covariate = covariate;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class OptionalCovariateInfo {
|
||||
public final BitSet covariateID; // cache the covariate ID
|
||||
public final Covariate covariate;
|
||||
|
|
|
|||
|
|
@ -55,6 +55,11 @@ public class Datum {
|
|||
numMismatches = 0L;
|
||||
}
|
||||
|
||||
public Datum(long numObservations, long numMismatches) {
|
||||
this.numObservations = numObservations;
|
||||
this.numMismatches = numMismatches;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// increment methods
|
||||
|
|
@ -90,4 +95,11 @@ public class Datum {
|
|||
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof Datum))
|
||||
return false;
|
||||
Datum other = (Datum) o;
|
||||
return numMismatches == other.numMismatches && numObservations == other.numObservations;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,4 +62,19 @@ public class ReadCovariates {
|
|||
for (int i = 0; i < covariateValues.length; i++)
|
||||
keySet[i][nextCovariateIndex] = covariateValues[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* Testing routines
|
||||
*/
|
||||
protected BitSet[][] getMismatchesKeySet() {
|
||||
return mismatchesKeySet;
|
||||
}
|
||||
|
||||
protected BitSet[][] getInsertionsKeySet() {
|
||||
return insertionsKeySet;
|
||||
}
|
||||
|
||||
protected BitSet[][] getDeletionsKeySet() {
|
||||
return deletionsKeySet;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,22 +25,24 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.R.RScriptExecutor;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.io.Resource;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -74,11 +76,13 @@ public class RecalDataManager {
|
|||
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
|
||||
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
|
||||
|
||||
private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
private static final String SCRIPT_FILE = "BQSR.R";
|
||||
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
* Treat reference inserted bases as reference matching bases. Very unsafe!
|
||||
|
|
@ -309,6 +313,130 @@ public class RecalDataManager {
|
|||
report.print(outputFile);
|
||||
}
|
||||
|
||||
private static Pair<PrintStream, File> initializeRecalibrationPlot(File filename) {
|
||||
final PrintStream deltaTableStream;
|
||||
final File deltaTableFileName = new File(filename + ".csv");
|
||||
try {
|
||||
deltaTableStream = new PrintStream(deltaTableFileName);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created");
|
||||
}
|
||||
return new Pair<PrintStream, File>(deltaTableStream, deltaTableFileName);
|
||||
}
|
||||
|
||||
private static void outputRecalibrationPlot(Pair<PrintStream, File> files, boolean keepIntermediates) {
|
||||
final File csvFileName = files.getSecond();
|
||||
final File plotFileName = new File(csvFileName + ".pdf");
|
||||
files.getFirst().close();
|
||||
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class));
|
||||
executor.addArgs(csvFileName.getAbsolutePath());
|
||||
executor.addArgs(plotFileName.getAbsolutePath());
|
||||
executor.exec();
|
||||
|
||||
if (!keepIntermediates)
|
||||
if (!csvFileName.delete())
|
||||
throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath());
|
||||
|
||||
}
|
||||
|
||||
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, boolean keepIntermediates) {
|
||||
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
|
||||
writeCSV(files.getFirst(), original, "ORIGINAL", true);
|
||||
outputRecalibrationPlot(files, keepIntermediates);
|
||||
}
|
||||
|
||||
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> recalibrated, boolean keepIntermediates) {
|
||||
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
|
||||
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true);
|
||||
writeCSV(files.getFirst(), original, "ORIGINAL", false);
|
||||
outputRecalibrationPlot(files, keepIntermediates);
|
||||
}
|
||||
|
||||
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
|
||||
final int QUALITY_SCORE_COVARIATE_INDEX = 1;
|
||||
final Map<BitSet, AccuracyDatum> deltaTable = new HashMap<BitSet, AccuracyDatum>();
|
||||
|
||||
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
|
||||
BQSRKeyManager keyManager = tableEntry.getKey();
|
||||
|
||||
if (keyManager.getOptionalCovariates().size() > 0) { // only need the 'all covariates' table
|
||||
Map<BitSet, RecalDatum> table = tableEntry.getValue();
|
||||
|
||||
// create a key manager for the delta table
|
||||
List<Covariate> requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate
|
||||
List<Covariate> optionalCovariates = keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate
|
||||
optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates
|
||||
BQSRKeyManager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
|
||||
|
||||
|
||||
// create delta table
|
||||
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
|
||||
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
|
||||
|
||||
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
|
||||
byte originalQuality = Byte.parseByte((String) covs.get(QUALITY_SCORE_COVARIATE_INDEX)); // save the original quality for accuracy calculation later on
|
||||
covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
|
||||
BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
|
||||
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
|
||||
|
||||
covs.set(1, originalQuality); // replace the covariate value with the quality score
|
||||
covs.set(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
|
||||
deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
|
||||
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
|
||||
}
|
||||
|
||||
// print header
|
||||
if (printHeader) {
|
||||
List<String> header = new LinkedList<String>();
|
||||
header.add("ReadGroup");
|
||||
header.add("CovariateValue");
|
||||
header.add("CovariateName");
|
||||
header.add("EventType");
|
||||
header.add("Observations");
|
||||
header.add("Errors");
|
||||
header.add("EmpiricalQuality");
|
||||
header.add("AverageReportedQuality");
|
||||
header.add("Accuracy");
|
||||
header.add("Recalibration");
|
||||
deltaTableFile.println(Utils.join(",", header));
|
||||
}
|
||||
|
||||
// print each data line
|
||||
for(Map.Entry<BitSet, AccuracyDatum> deltaEntry : deltaTable.entrySet()) {
|
||||
List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
|
||||
RecalDatum deltaDatum = deltaEntry.getValue();
|
||||
deltaTableFile.print(Utils.join(",", deltaKeys));
|
||||
deltaTableFile.print("," + deltaDatum.toString());
|
||||
deltaTableFile.println("," + recalibrationMode);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the current AccuracyDatum element in the delta table.
|
||||
*
|
||||
* If it doesn't have an element yet, it creates an AccuracyDatum element and adds it to the delta table.
|
||||
*
|
||||
* @param deltaTable the delta table
|
||||
* @param deltaKey the key to the table
|
||||
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table
|
||||
* @param originalQuality the quality score to we can calculate the accuracy for the accuracyDatum element
|
||||
*/
|
||||
private static void addToDeltaTable(Map<BitSet, AccuracyDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum, byte originalQuality) {
|
||||
AccuracyDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
|
||||
if (deltaDatum == null)
|
||||
deltaTable.put(deltaKey, new AccuracyDatum(recalDatum, originalQuality)); // if we don't have a key yet, create a new one with the same values as the curent datum
|
||||
else
|
||||
deltaDatum.combine(recalDatum, originalQuality); // if we do have a datum, combine it with this one.
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
*
|
||||
|
|
@ -382,127 +510,6 @@ public class RecalDataManager {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
|
||||
|
||||
final boolean negStrand = read.getReadNegativeStrandFlag();
|
||||
for (int iii = 1; iii < originalQualScores.length; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
if (readBases[iii] == refBases[iii]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if (readBases[iii - 1] == refBases[iii - 1]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - iii] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii - 1] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii - 1] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (negStrand) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases(readBases);
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if (attr != null) {
|
||||
byte[] colorSpaceQuals;
|
||||
if (attr instanceof String) {
|
||||
String x = (String) attr;
|
||||
colorSpaceQuals = x.getBytes();
|
||||
SAMUtils.fastqToPhred(colorSpaceQuals);
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
|
||||
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if (readBases[jjj] == refBases[jjj]) {
|
||||
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
|
||||
if (rand == 0) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
}
|
||||
else {
|
||||
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if (numLow == 0) {
|
||||
numLow++;
|
||||
diffInQuality++;
|
||||
}
|
||||
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
|
||||
if (rand >= numLow) { // higher q score won
|
||||
if (maxQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had higher q score, and won out, so nothing to do here
|
||||
}
|
||||
else { // lower q score won
|
||||
if (minQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had lower q score, and won out, so nothing to do here
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases(readBases);
|
||||
}
|
||||
else { // No color space quality tag in file
|
||||
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the base and the color calculate the next base in the sequence
|
||||
*
|
||||
|
|
|
|||
|
|
@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -114,4 +118,29 @@ public class RecalDatum extends Datum {
|
|||
return Math.pow(10.0, qual / -10.0);
|
||||
}
|
||||
|
||||
public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
|
||||
Random random = new Random();
|
||||
int nObservations = random.nextInt(maxObservations);
|
||||
int nErrors = random.nextInt(maxErrors);
|
||||
Datum datum = new Datum(nObservations, nErrors);
|
||||
double empiricalQuality = datum.empiricalQualDouble();
|
||||
double estimatedQReported = empiricalQuality + ((10 * random.nextDouble()) - 5); // empirical quality +/- 5.
|
||||
return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
|
||||
}
|
||||
|
||||
/**
|
||||
* We don't compare the estimated quality reported because it may be different when read from
|
||||
* report tables.
|
||||
*
|
||||
* @param o the other recal datum
|
||||
* @return true if the two recal datums have the same number of observations, errors and empirical quality.
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof RecalDatum))
|
||||
return false;
|
||||
RecalDatum other = (RecalDatum) o;
|
||||
return super.equals(o) &&
|
||||
MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -170,6 +170,8 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering")
|
||||
public boolean NO_PLOTS = false;
|
||||
|
||||
public File recalibrationReport = null;
|
||||
|
||||
public GATKReportTable generateReportTable() {
|
||||
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
|
||||
argumentsTable.addPrimaryKey("Argument");
|
||||
|
|
@ -190,6 +192,7 @@ public class RecalibrationArgumentCollection {
|
|||
argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
|
||||
argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES);
|
||||
argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS);
|
||||
argumentsTable.set("recalibration_report", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath());
|
||||
return argumentsTable;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -73,19 +73,26 @@ public class RecalibrationReport {
|
|||
keysAndTablesMap.put(keyManager, table);
|
||||
}
|
||||
|
||||
protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) {
|
||||
this.quantizationInfo = quantizationInfo;
|
||||
this.keysAndTablesMap = keysAndTablesMap;
|
||||
this.argumentTable = argumentTable;
|
||||
this.RAC = RAC;
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines two recalibration reports by adding all observations and errors
|
||||
*
|
||||
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
|
||||
* them after combining. The reason for not calculating it is because this function is inteded for combining a
|
||||
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
|
||||
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
|
||||
* makes this method faster
|
||||
*
|
||||
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
|
||||
*
|
||||
* @param other the recalibration report to combine with this one
|
||||
*/
|
||||
* Combines two recalibration reports by adding all observations and errors
|
||||
*
|
||||
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
|
||||
* them after combining. The reason for not calculating it is because this function is inteded for combining a
|
||||
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
|
||||
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
|
||||
* makes this method faster
|
||||
*
|
||||
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
|
||||
*
|
||||
* @param other the recalibration report to combine with this one
|
||||
*/
|
||||
public void combine(RecalibrationReport other) {
|
||||
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
|
||||
|
||||
|
|
@ -285,6 +292,12 @@ public class RecalibrationReport {
|
|||
|
||||
else if (primaryKey.equals("no_plots"))
|
||||
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
|
||||
|
||||
else if (primaryKey.equals("no_plots"))
|
||||
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
|
||||
|
||||
else if (primaryKey.equals("recalibration_report"))
|
||||
RAC.recalibrationReport = (value == null) ? null : new File((String) value);
|
||||
}
|
||||
|
||||
return RAC;
|
||||
|
|
@ -305,4 +318,45 @@ public class RecalibrationReport {
|
|||
public void output(PrintStream output) {
|
||||
RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output);
|
||||
}
|
||||
|
||||
public RecalibrationArgumentCollection getRAC() {
|
||||
return RAC;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof RecalibrationReport))
|
||||
return false;
|
||||
RecalibrationReport other = (RecalibrationReport) o;
|
||||
if (this == o)
|
||||
return true;
|
||||
return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap);
|
||||
}
|
||||
|
||||
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t2) {
|
||||
if (t1.size() != t2.size())
|
||||
return false;
|
||||
|
||||
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
|
||||
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
|
||||
|
||||
while (t1Iterator.hasNext() && t2Iterator.hasNext()) {
|
||||
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t1MapEntry = t1Iterator.next();
|
||||
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t2MapEntry = t2Iterator.next();
|
||||
|
||||
if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey())))
|
||||
return false;
|
||||
|
||||
Map<BitSet, RecalDatum> table2 = t2MapEntry.getValue();
|
||||
for (Map.Entry<BitSet, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) {
|
||||
BitSet t1Key = t1TableEntry.getKey();
|
||||
if (!table2.containsKey(t1Key))
|
||||
return false;
|
||||
RecalDatum t1Datum = t1TableEntry.getValue();
|
||||
if (!t1Datum.equals(table2.get(t1Key)))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -738,8 +738,12 @@ public class ReadUtils {
|
|||
}
|
||||
|
||||
public static GATKSAMRecord createRandomRead(int length) {
|
||||
return createRandomRead(length, true);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord createRandomRead(int length, boolean allowNs) {
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(length);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(length, true);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs);
|
||||
return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,24 +31,29 @@ public class ContextCovariateUnitTest {
|
|||
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
|
||||
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
||||
CovariateValues values = covariate.getValues(read);
|
||||
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate);
|
||||
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, clippedRead, covariate);
|
||||
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, clippedRead, covariate);
|
||||
}
|
||||
|
||||
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
String expectedContext = null;
|
||||
if (i - contextSize + 1 >= 0) {
|
||||
String context = bases.substring(i - contextSize + 1, i + 1);
|
||||
if (!context.contains("N"))
|
||||
expectedContext = context;
|
||||
}
|
||||
Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext);
|
||||
public static void verifyCovariateArray(BitSet[] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) {
|
||||
for (int i = 0; i < values.length; i++)
|
||||
Assert.assertEquals(contextCovariate.keyFromBitSet(values[i]), expectedContext(read, i, contextSize));
|
||||
|
||||
}
|
||||
|
||||
public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) {
|
||||
final String bases = stringFrom(read.getReadBases());
|
||||
String expectedContext = null;
|
||||
if (offset - contextSize + 1 >= 0) {
|
||||
String context = bases.substring(offset - contextSize + 1, offset + 1);
|
||||
if (!context.contains("N"))
|
||||
expectedContext = context;
|
||||
}
|
||||
return expectedContext;
|
||||
}
|
||||
|
||||
private String stringFrom(byte[] array) {
|
||||
private static String stringFrom(byte[] array) {
|
||||
String s = "";
|
||||
for (byte value : array)
|
||||
s += (char) value;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,78 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author carneiro
|
||||
* @since 4/21/12
|
||||
*/
|
||||
public class ReadCovariatesUnitTest {
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCovariateGeneration() {
|
||||
final String RGID = "id";
|
||||
final int length = 10;
|
||||
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
|
||||
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
|
||||
rg.setPlatform("illumina");
|
||||
read.setReadGroup(rg);
|
||||
final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
|
||||
final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
|
||||
final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);
|
||||
|
||||
ReadGroupCovariate rgCov = new ReadGroupCovariate();
|
||||
QualityScoreCovariate qsCov = new QualityScoreCovariate();
|
||||
ContextCovariate coCov = new ContextCovariate();
|
||||
CycleCovariate cyCov = new CycleCovariate();
|
||||
|
||||
rgCov.initialize(RAC);
|
||||
qsCov.initialize(RAC);
|
||||
coCov.initialize(RAC);
|
||||
cyCov.initialize(RAC);
|
||||
|
||||
List<Covariate> requestedCovariates = new ArrayList<Covariate>(4);
|
||||
requestedCovariates.add(rgCov);
|
||||
requestedCovariates.add(qsCov);
|
||||
requestedCovariates.add(coCov);
|
||||
requestedCovariates.add(cyCov);
|
||||
|
||||
ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||
|
||||
// check that the length is correct
|
||||
Assert.assertEquals(rc.getMismatchesKeySet().length, length);
|
||||
Assert.assertEquals(rc.getInsertionsKeySet().length, length);
|
||||
Assert.assertEquals(rc.getDeletionsKeySet().length, length);
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
// check that read group is always the same
|
||||
Assert.assertEquals(rgCov.keyFromBitSet(rc.getMismatchesKeySet(i)[0]), RGID);
|
||||
Assert.assertEquals(rgCov.keyFromBitSet(rc.getInsertionsKeySet(i)[0]), RGID);
|
||||
Assert.assertEquals(rgCov.keyFromBitSet(rc.getDeletionsKeySet(i)[0]), RGID);
|
||||
|
||||
// check quality score
|
||||
Assert.assertEquals(qsCov.keyFromBitSet(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
|
||||
Assert.assertEquals(qsCov.keyFromBitSet(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
|
||||
Assert.assertEquals(qsCov.keyFromBitSet(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);
|
||||
|
||||
// check context
|
||||
Assert.assertEquals(coCov.keyFromBitSet(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
|
||||
Assert.assertEquals(coCov.keyFromBitSet(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INSERTIONS_CONTEXT_SIZE));
|
||||
Assert.assertEquals(coCov.keyFromBitSet(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.DELETIONS_CONTEXT_SIZE));
|
||||
|
||||
// check cycle
|
||||
Assert.assertEquals(cyCov.keyFromBitSet(rc.getMismatchesKeySet(i)[3]), "" + (i+1));
|
||||
Assert.assertEquals(cyCov.keyFromBitSet(rc.getInsertionsKeySet(i)[3]), "" + (i+1));
|
||||
Assert.assertEquals(cyCov.keyFromBitSet(rc.getDeletionsKeySet(i)[3]), "" + (i+1));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author carneiro
|
||||
* @since 4/21/12
|
||||
*/
|
||||
public class RecalibrationReportUnitTest {
|
||||
@Test(enabled = true)
|
||||
public void testOutput() {
|
||||
final int length = 100;
|
||||
|
||||
List<Byte> quals = new ArrayList<Byte>(QualityUtils.MAX_QUAL_SCORE + 1);
|
||||
List<Long> counts = new ArrayList<Long>(QualityUtils.MAX_QUAL_SCORE + 1);
|
||||
|
||||
for (int i = 0; i<= QualityUtils.MAX_QUAL_SCORE; i++) {
|
||||
quals.add((byte) i);
|
||||
counts.add(1L);
|
||||
}
|
||||
|
||||
final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts);
|
||||
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
|
||||
|
||||
quantizationInfo.noQuantization();
|
||||
final List<Covariate> requiredCovariates = new LinkedList<Covariate>();
|
||||
final List<Covariate> optionalCovariates = new LinkedList<Covariate>();
|
||||
final List<Covariate> requestedCovariates = new LinkedList<Covariate>();
|
||||
|
||||
final ReadGroupCovariate rgCovariate = new ReadGroupCovariate();
|
||||
rgCovariate.initialize(RAC);
|
||||
requiredCovariates.add(rgCovariate);
|
||||
final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
|
||||
keysAndTablesMap.put(rgKeyManager, new HashMap<BitSet, RecalDatum>());
|
||||
|
||||
final QualityScoreCovariate qsCovariate = new QualityScoreCovariate();
|
||||
qsCovariate.initialize(RAC);
|
||||
requiredCovariates.add(qsCovariate);
|
||||
final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
|
||||
keysAndTablesMap.put(qsKeyManager, new HashMap<BitSet, RecalDatum>());
|
||||
|
||||
final ContextCovariate cxCovariate = new ContextCovariate();
|
||||
cxCovariate.initialize(RAC);
|
||||
optionalCovariates.add(cxCovariate);
|
||||
final CycleCovariate cyCovariate = new CycleCovariate();
|
||||
cyCovariate.initialize(RAC);
|
||||
optionalCovariates.add(cyCovariate);
|
||||
BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
|
||||
keysAndTablesMap.put(cvKeyManager, new HashMap<BitSet, RecalDatum>());
|
||||
|
||||
for (Covariate cov : requiredCovariates)
|
||||
requestedCovariates.add(cov);
|
||||
for (Covariate cov : optionalCovariates)
|
||||
requestedCovariates.add(cov);
|
||||
|
||||
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id");
|
||||
rg.setPlatform("illumina");
|
||||
final GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
|
||||
read.setReadGroup(rg);
|
||||
final byte [] readQuals = new byte[length];
|
||||
for (int i = 0; i < length; i++)
|
||||
readQuals[i] = 20;
|
||||
read.setBaseQualities(readQuals);
|
||||
|
||||
|
||||
final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INSERTIONS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE);
|
||||
int nKeys = 0; // keep track of how many keys were produced
|
||||
final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||
for (int offset = 0; offset < length; offset++) {
|
||||
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
|
||||
BQSRKeyManager keyManager = entry.getKey();
|
||||
Map<BitSet, RecalDatum> table = entry.getValue();
|
||||
|
||||
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION)) {
|
||||
table.put(key, RecalDatum.createRandomRecalDatum(10000, 10));
|
||||
nKeys++;
|
||||
}
|
||||
|
||||
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getInsertionsKeySet(offset), EventType.BASE_INSERTION)) {
|
||||
table.put(key, RecalDatum.createRandomRecalDatum(100000, 10));
|
||||
nKeys++;
|
||||
}
|
||||
|
||||
|
||||
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getDeletionsKeySet(offset), EventType.BASE_DELETION)) {
|
||||
table.put(key, RecalDatum.createRandomRecalDatum(100000, 10));
|
||||
nKeys++;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(nKeys, expectedKeys);
|
||||
|
||||
RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC);
|
||||
|
||||
File output = new File("RecalibrationReportUnitTestOutuput.grp");
|
||||
PrintStream out;
|
||||
try {
|
||||
out = new PrintStream(output);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new ReviewedStingException("couldn't create the file " + output, e);
|
||||
}
|
||||
report.output(out);
|
||||
|
||||
RecalibrationReport loadedReport = new RecalibrationReport(output);
|
||||
|
||||
Assert.assertTrue(report.equals(loadedReport));
|
||||
if (!output.delete())
|
||||
throw new ReviewedStingException("File could not be deleted " + output);
|
||||
}
|
||||
|
||||
private static int expectedNumberOfKeys (int nCovariates, int readLength, int indelContextSize, int mismatchesContextSize) {
|
||||
int nommcs = readLength >= mismatchesContextSize ? mismatchesContextSize-1 : readLength;
|
||||
int noincs = readLength >= indelContextSize ? 2*(indelContextSize-1) : 2*readLength;
|
||||
return (nCovariates * readLength * 3) - nommcs - noincs;
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue