BQSR triage and test routines

* updated BQSR queue script for faster turnaround
   * implemented plot generation for scatter/gatherered runs
   * adjusted output file names to be cooperative with the queue script
   * added the recalibration report file to the argument table in the report
   * added ReadCovariates unit test -- guarantees that all the covariates are being generated for every base in the read
   * added RecalibrationReport unit test -- guarantees the integrity of the delta tables
This commit is contained in:
Mauricio Carneiro 2012-04-20 12:59:59 -04:00
parent a733723439
commit e39a59594a
12 changed files with 532 additions and 152 deletions

View File

@ -65,6 +65,14 @@ public class BQSRGatherer extends Gatherer {
if (generalReport == null)
throw new ReviewedStingException(EMPTY_INPUT_LIST);
RecalibrationArgumentCollection RAC = generalReport.getRAC();
if (RAC.recalibrationReport != null) {
RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport);
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
}
else
RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
generalReport.calculateEmpiricalAndQuantizedQualities();
generalReport.output(outputFile);
}

View File

@ -277,7 +277,42 @@ public class BQSRKeyManager {
bitSet.and(mask);
return chopNBitsFrom(bitSet, leadingBits);
}
@Override
public boolean equals(Object o) {
if (!(o instanceof BQSRKeyManager))
return false;
BQSRKeyManager other = (BQSRKeyManager) o;
if (this == other)
return true;
if (requiredCovariates.size() != other.requiredCovariates.size() || optionalCovariates.size() != other.optionalCovariates.size())
return false;
Iterator<RequiredCovariateInfo> otherRequiredIterator = other.requiredCovariates.iterator();
for (RequiredCovariateInfo thisInfo: requiredCovariates) {
RequiredCovariateInfo otherInfo = otherRequiredIterator.next();
String thisName = thisInfo.covariate.getClass().getSimpleName();
String otherName = otherInfo.covariate.getClass().getSimpleName();
if (!thisName.equals(otherName))
return false;
}
Iterator<OptionalCovariateInfo> otherOptionalIterator = other.optionalCovariates.iterator();
for (OptionalCovariateInfo thisInfo : optionalCovariates) {
OptionalCovariateInfo otherInfo = otherOptionalIterator.next();
String thisName = thisInfo.covariate.getClass().getSimpleName();
String otherName = otherInfo.covariate.getClass().getSimpleName();
if (!thisName.equals(otherName))
return false;
}
return true;
}
/**
* Aggregate information for each Covariate
*/
@ -292,7 +327,7 @@ public class BQSRKeyManager {
this.covariate = covariate;
}
}
class OptionalCovariateInfo {
public final BitSet covariateID; // cache the covariate ID
public final Covariate covariate;

View File

@ -55,6 +55,11 @@ public class Datum {
numMismatches = 0L;
}
public Datum(long numObservations, long numMismatches) {
this.numObservations = numObservations;
this.numMismatches = numMismatches;
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
@ -90,4 +95,11 @@ public class Datum {
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
}
@Override
public boolean equals(Object o) {
if (!(o instanceof Datum))
return false;
Datum other = (Datum) o;
return numMismatches == other.numMismatches && numObservations == other.numObservations;
}
}

View File

@ -62,4 +62,19 @@ public class ReadCovariates {
for (int i = 0; i < covariateValues.length; i++)
keySet[i][nextCovariateIndex] = covariateValues[i];
}
/**
* Testing routines
*/
protected BitSet[][] getMismatchesKeySet() {
return mismatchesKeySet;
}
protected BitSet[][] getInsertionsKeySet() {
return insertionsKeySet;
}
protected BitSet[][] getDeletionsKeySet() {
return deletionsKeySet;
}
}

View File

@ -25,22 +25,24 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.io.Resource;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
@ -74,11 +76,13 @@ public class RecalDataManager {
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullPlatform = false;
private static final String SCRIPT_FILE = "BQSR.R";
public enum SOLID_RECAL_MODE {
/**
* Treat reference inserted bases as reference matching bases. Very unsafe!
@ -309,6 +313,130 @@ public class RecalDataManager {
report.print(outputFile);
}
private static Pair<PrintStream, File> initializeRecalibrationPlot(File filename) {
final PrintStream deltaTableStream;
final File deltaTableFileName = new File(filename + ".csv");
try {
deltaTableStream = new PrintStream(deltaTableFileName);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created");
}
return new Pair<PrintStream, File>(deltaTableStream, deltaTableFileName);
}
private static void outputRecalibrationPlot(Pair<PrintStream, File> files, boolean keepIntermediates) {
final File csvFileName = files.getSecond();
final File plotFileName = new File(csvFileName + ".pdf");
files.getFirst().close();
RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class));
executor.addArgs(csvFileName.getAbsolutePath());
executor.addArgs(plotFileName.getAbsolutePath());
executor.exec();
if (!keepIntermediates)
if (!csvFileName.delete())
throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath());
}
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), original, "ORIGINAL", true);
outputRecalibrationPlot(files, keepIntermediates);
}
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> recalibrated, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true);
writeCSV(files.getFirst(), original, "ORIGINAL", false);
outputRecalibrationPlot(files, keepIntermediates);
}
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
final int QUALITY_SCORE_COVARIATE_INDEX = 1;
final Map<BitSet, AccuracyDatum> deltaTable = new HashMap<BitSet, AccuracyDatum>();
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getOptionalCovariates().size() > 0) { // only need the 'all covariates' table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
// create a key manager for the delta table
List<Covariate> requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate
List<Covariate> optionalCovariates = keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate
optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates
BQSRKeyManager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
// create delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
byte originalQuality = Byte.parseByte((String) covs.get(QUALITY_SCORE_COVARIATE_INDEX)); // save the original quality for accuracy calculation later on
covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
covs.set(1, originalQuality); // replace the covariate value with the quality score
covs.set(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table
}
// print header
if (printHeader) {
List<String> header = new LinkedList<String>();
header.add("ReadGroup");
header.add("CovariateValue");
header.add("CovariateName");
header.add("EventType");
header.add("Observations");
header.add("Errors");
header.add("EmpiricalQuality");
header.add("AverageReportedQuality");
header.add("Accuracy");
header.add("Recalibration");
deltaTableFile.println(Utils.join(",", header));
}
// print each data line
for(Map.Entry<BitSet, AccuracyDatum> deltaEntry : deltaTable.entrySet()) {
List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
RecalDatum deltaDatum = deltaEntry.getValue();
deltaTableFile.print(Utils.join(",", deltaKeys));
deltaTableFile.print("," + deltaDatum.toString());
deltaTableFile.println("," + recalibrationMode);
}
}
}
}
/**
* Updates the current AccuracyDatum element in the delta table.
*
* If it doesn't have an element yet, it creates an AccuracyDatum element and adds it to the delta table.
*
* @param deltaTable the delta table
* @param deltaKey the key to the table
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table
* @param originalQuality the quality score to we can calculate the accuracy for the accuracyDatum element
*/
private static void addToDeltaTable(Map<BitSet, AccuracyDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum, byte originalQuality) {
AccuracyDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
if (deltaDatum == null)
deltaTable.put(deltaKey, new AccuracyDatum(recalDatum, originalQuality)); // if we don't have a key yet, create a new one with the same values as the curent datum
else
deltaDatum.combine(recalDatum, originalQuality); // if we do have a datum, combine it with this one.
}
/**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
*
@ -382,127 +510,6 @@ public class RecalDataManager {
return false;
}
/**
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param originalQualScores The array of original quality scores to set to zero if needed
* @param refBases The reference which has been RC'd if necessary
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
* @return The byte array of original quality scores some of which might have been set to zero
*/
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
final boolean negStrand = read.getReadNegativeStrandFlag();
for (int iii = 1; iii < originalQualScores.length; iii++) {
if (inconsistency[iii] == 1) {
if (readBases[iii] == refBases[iii]) {
if (negStrand) {
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
}
else {
originalQualScores[iii] = (byte) 0;
}
if (setBaseN) {
readBases[iii] = (byte) 'N';
}
}
// Set the prev base to Q0 as well
if (readBases[iii - 1] == refBases[iii - 1]) {
if (negStrand) {
originalQualScores[originalQualScores.length - iii] = (byte) 0;
}
else {
originalQualScores[iii - 1] = (byte) 0;
}
if (setBaseN) {
readBases[iii - 1] = (byte) 'N';
}
}
}
}
if (negStrand) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
return originalQualScores;
}
/**
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
* @param refBases The reference which has been RC'd if necessary
*/
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpaceQuals;
if (attr instanceof String) {
String x = (String) attr;
colorSpaceQuals = x.getBytes();
SAMUtils.fastqToPhred(colorSpaceQuals);
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
if (inconsistency[iii] == 1) {
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
if (readBases[jjj] == refBases[jjj]) {
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
if (rand == 0) { // The color implied base won the coin flip
readBases[jjj] = colorImpliedBases[jjj];
}
}
else {
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
int diffInQuality = maxQuality - minQuality;
int numLow = minQuality;
if (numLow == 0) {
numLow++;
diffInQuality++;
}
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
if (rand >= numLow) { // higher q score won
if (maxQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had higher q score, and won out, so nothing to do here
}
else { // lower q score won
if (minQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had lower q score, and won out, so nothing to do here
}
}
}
}
}
}
}
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
}
else { // No color space quality tag in file
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
}
}
/**
* Given the base and the color calculate the next base in the sequence
*

View File

@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Random;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -114,4 +118,29 @@ public class RecalDatum extends Datum {
return Math.pow(10.0, qual / -10.0);
}
public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
Random random = new Random();
int nObservations = random.nextInt(maxObservations);
int nErrors = random.nextInt(maxErrors);
Datum datum = new Datum(nObservations, nErrors);
double empiricalQuality = datum.empiricalQualDouble();
double estimatedQReported = empiricalQuality + ((10 * random.nextDouble()) - 5); // empirical quality +/- 5.
return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
}
/**
* We don't compare the estimated quality reported because it may be different when read from
* report tables.
*
* @param o the other recal datum
* @return true if the two recal datums have the same number of observations, errors and empirical quality.
*/
@Override
public boolean equals(Object o) {
if (!(o instanceof RecalDatum))
return false;
RecalDatum other = (RecalDatum) o;
return super.equals(o) &&
MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
}
}

View File

@ -170,6 +170,8 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering")
public boolean NO_PLOTS = false;
public File recalibrationReport = null;
public GATKReportTable generateReportTable() {
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
argumentsTable.addPrimaryKey("Argument");
@ -190,6 +192,7 @@ public class RecalibrationArgumentCollection {
argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES);
argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS);
argumentsTable.set("recalibration_report", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath());
return argumentsTable;
}

View File

@ -73,19 +73,26 @@ public class RecalibrationReport {
keysAndTablesMap.put(keyManager, table);
}
protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) {
this.quantizationInfo = quantizationInfo;
this.keysAndTablesMap = keysAndTablesMap;
this.argumentTable = argumentTable;
this.RAC = RAC;
}
/**
* Combines two recalibration reports by adding all observations and errors
*
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
* them after combining. The reason for not calculating it is because this function is inteded for combining a
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
* makes this method faster
*
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
*
* @param other the recalibration report to combine with this one
*/
* Combines two recalibration reports by adding all observations and errors
*
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
* them after combining. The reason for not calculating it is because this function is inteded for combining a
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
* makes this method faster
*
* Note2: The empirical quality reported, however, is recalculated given its simplicity.
*
* @param other the recalibration report to combine with this one
*/
public void combine(RecalibrationReport other) {
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
@ -285,6 +292,12 @@ public class RecalibrationReport {
else if (primaryKey.equals("no_plots"))
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("no_plots"))
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("recalibration_report"))
RAC.recalibrationReport = (value == null) ? null : new File((String) value);
}
return RAC;
@ -305,4 +318,45 @@ public class RecalibrationReport {
public void output(PrintStream output) {
RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output);
}
public RecalibrationArgumentCollection getRAC() {
return RAC;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof RecalibrationReport))
return false;
RecalibrationReport other = (RecalibrationReport) o;
if (this == o)
return true;
return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap);
}
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t2) {
if (t1.size() != t2.size())
return false;
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
while (t1Iterator.hasNext() && t2Iterator.hasNext()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t1MapEntry = t1Iterator.next();
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t2MapEntry = t2Iterator.next();
if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey())))
return false;
Map<BitSet, RecalDatum> table2 = t2MapEntry.getValue();
for (Map.Entry<BitSet, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) {
BitSet t1Key = t1TableEntry.getKey();
if (!table2.containsKey(t1Key))
return false;
RecalDatum t1Datum = t1TableEntry.getValue();
if (!t1Datum.equals(table2.get(t1Key)))
return false;
}
}
return true;
}
}

View File

@ -738,8 +738,12 @@ public class ReadUtils {
}
public static GATKSAMRecord createRandomRead(int length) {
return createRandomRead(length, true);
}
public static GATKSAMRecord createRandomRead(int length, boolean allowNs) {
byte[] quals = ReadUtils.createRandomReadQuals(length);
byte[] bbases = ReadUtils.createRandomReadBases(length, true);
byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs);
return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
}

View File

@ -31,24 +31,29 @@ public class ContextCovariateUnitTest {
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
CovariateValues values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate);
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, clippedRead, covariate);
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, clippedRead, covariate);
}
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
for (int i = 0; i < values.length; i++) {
String expectedContext = null;
if (i - contextSize + 1 >= 0) {
String context = bases.substring(i - contextSize + 1, i + 1);
if (!context.contains("N"))
expectedContext = context;
}
Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext);
public static void verifyCovariateArray(BitSet[] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) {
for (int i = 0; i < values.length; i++)
Assert.assertEquals(contextCovariate.keyFromBitSet(values[i]), expectedContext(read, i, contextSize));
}
public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) {
final String bases = stringFrom(read.getReadBases());
String expectedContext = null;
if (offset - contextSize + 1 >= 0) {
String context = bases.substring(offset - contextSize + 1, offset + 1);
if (!context.contains("N"))
expectedContext = context;
}
return expectedContext;
}
private String stringFrom(byte[] array) {
private static String stringFrom(byte[] array) {
String s = "";
for (byte value : array)
s += (char) value;

View File

@ -0,0 +1,78 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.List;
/**
* @author carneiro
* @since 4/21/12
*/
public class ReadCovariatesUnitTest {
@Test(enabled = true)
public void testCovariateGeneration() {
final String RGID = "id";
final int length = 10;
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID);
rg.setPlatform("illumina");
read.setReadGroup(rg);
final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION);
final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION);
final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION);
ReadGroupCovariate rgCov = new ReadGroupCovariate();
QualityScoreCovariate qsCov = new QualityScoreCovariate();
ContextCovariate coCov = new ContextCovariate();
CycleCovariate cyCov = new CycleCovariate();
rgCov.initialize(RAC);
qsCov.initialize(RAC);
coCov.initialize(RAC);
cyCov.initialize(RAC);
List<Covariate> requestedCovariates = new ArrayList<Covariate>(4);
requestedCovariates.add(rgCov);
requestedCovariates.add(qsCov);
requestedCovariates.add(coCov);
requestedCovariates.add(cyCov);
ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);
// check that the length is correct
Assert.assertEquals(rc.getMismatchesKeySet().length, length);
Assert.assertEquals(rc.getInsertionsKeySet().length, length);
Assert.assertEquals(rc.getDeletionsKeySet().length, length);
for (int i = 0; i < length; i++) {
// check that read group is always the same
Assert.assertEquals(rgCov.keyFromBitSet(rc.getMismatchesKeySet(i)[0]), RGID);
Assert.assertEquals(rgCov.keyFromBitSet(rc.getInsertionsKeySet(i)[0]), RGID);
Assert.assertEquals(rgCov.keyFromBitSet(rc.getDeletionsKeySet(i)[0]), RGID);
// check quality score
Assert.assertEquals(qsCov.keyFromBitSet(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]);
Assert.assertEquals(qsCov.keyFromBitSet(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]);
Assert.assertEquals(qsCov.keyFromBitSet(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]);
// check context
Assert.assertEquals(coCov.keyFromBitSet(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE));
Assert.assertEquals(coCov.keyFromBitSet(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INSERTIONS_CONTEXT_SIZE));
Assert.assertEquals(coCov.keyFromBitSet(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.DELETIONS_CONTEXT_SIZE));
// check cycle
Assert.assertEquals(cyCov.keyFromBitSet(rc.getMismatchesKeySet(i)[3]), "" + (i+1));
Assert.assertEquals(cyCov.keyFromBitSet(rc.getInsertionsKeySet(i)[3]), "" + (i+1));
Assert.assertEquals(cyCov.keyFromBitSet(rc.getDeletionsKeySet(i)[3]), "" + (i+1));
}
}
}

View File

@ -0,0 +1,130 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* @author carneiro
* @since 4/21/12
*/
public class RecalibrationReportUnitTest {
@Test(enabled = true)
public void testOutput() {
final int length = 100;
List<Byte> quals = new ArrayList<Byte>(QualityUtils.MAX_QUAL_SCORE + 1);
List<Long> counts = new ArrayList<Long>(QualityUtils.MAX_QUAL_SCORE + 1);
for (int i = 0; i<= QualityUtils.MAX_QUAL_SCORE; i++) {
quals.add((byte) i);
counts.add(1L);
}
final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts);
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
quantizationInfo.noQuantization();
final List<Covariate> requiredCovariates = new LinkedList<Covariate>();
final List<Covariate> optionalCovariates = new LinkedList<Covariate>();
final List<Covariate> requestedCovariates = new LinkedList<Covariate>();
final ReadGroupCovariate rgCovariate = new ReadGroupCovariate();
rgCovariate.initialize(RAC);
requiredCovariates.add(rgCovariate);
final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(rgKeyManager, new HashMap<BitSet, RecalDatum>());
final QualityScoreCovariate qsCovariate = new QualityScoreCovariate();
qsCovariate.initialize(RAC);
requiredCovariates.add(qsCovariate);
final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(qsKeyManager, new HashMap<BitSet, RecalDatum>());
final ContextCovariate cxCovariate = new ContextCovariate();
cxCovariate.initialize(RAC);
optionalCovariates.add(cxCovariate);
final CycleCovariate cyCovariate = new CycleCovariate();
cyCovariate.initialize(RAC);
optionalCovariates.add(cyCovariate);
BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(cvKeyManager, new HashMap<BitSet, RecalDatum>());
for (Covariate cov : requiredCovariates)
requestedCovariates.add(cov);
for (Covariate cov : optionalCovariates)
requestedCovariates.add(cov);
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id");
rg.setPlatform("illumina");
final GATKSAMRecord read = ReadUtils.createRandomRead(length, false);
read.setReadGroup(rg);
final byte [] readQuals = new byte[length];
for (int i = 0; i < length; i++)
readQuals[i] = 20;
read.setBaseQualities(readQuals);
final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INSERTIONS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE);
int nKeys = 0; // keep track of how many keys were produced
final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates);
for (int offset = 0; offset < length; offset++) {
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
Map<BitSet, RecalDatum> table = entry.getValue();
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION)) {
table.put(key, RecalDatum.createRandomRecalDatum(10000, 10));
nKeys++;
}
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getInsertionsKeySet(offset), EventType.BASE_INSERTION)) {
table.put(key, RecalDatum.createRandomRecalDatum(100000, 10));
nKeys++;
}
for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getDeletionsKeySet(offset), EventType.BASE_DELETION)) {
table.put(key, RecalDatum.createRandomRecalDatum(100000, 10));
nKeys++;
}
}
}
Assert.assertEquals(nKeys, expectedKeys);
RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC);
File output = new File("RecalibrationReportUnitTestOutuput.grp");
PrintStream out;
try {
out = new PrintStream(output);
} catch (FileNotFoundException e) {
throw new ReviewedStingException("couldn't create the file " + output, e);
}
report.output(out);
RecalibrationReport loadedReport = new RecalibrationReport(output);
Assert.assertTrue(report.equals(loadedReport));
if (!output.delete())
throw new ReviewedStingException("File could not be deleted " + output);
}
private static int expectedNumberOfKeys (int nCovariates, int readLength, int indelContextSize, int mismatchesContextSize) {
int nommcs = readLength >= mismatchesContextSize ? mismatchesContextSize-1 : readLength;
int noincs = readLength >= indelContextSize ? 2*(indelContextSize-1) : 2*readLength;
return (nCovariates * readLength * 3) - nommcs - noincs;
}
}