From e39a59594aad2d68139b40507d5ecd5f22cc0736 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 20 Apr 2012 12:59:59 -0400 Subject: [PATCH] BQSR triage and test routines * updated BQSR queue script for faster turnaround * implemented plot generation for scatter/gatherered runs * adjusted output file names to be cooperative with the queue script * added the recalibration report file to the argument table in the report * added ReadCovariates unit test -- guarantees that all the covariates are being generated for every base in the read * added RecalibrationReport unit test -- guarantees the integrity of the delta tables --- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 8 + .../gatk/walkers/bqsr/BQSRKeyManager.java | 39 ++- .../sting/gatk/walkers/bqsr/Datum.java | 12 + .../gatk/walkers/bqsr/ReadCovariates.java | 15 ++ .../gatk/walkers/bqsr/RecalDataManager.java | 255 +++++++++--------- .../sting/gatk/walkers/bqsr/RecalDatum.java | 29 ++ .../bqsr/RecalibrationArgumentCollection.java | 3 + .../walkers/bqsr/RecalibrationReport.java | 78 +++++- .../sting/utils/sam/ReadUtils.java | 6 +- .../bqsr/ContextCovariateUnitTest.java | 31 ++- .../walkers/bqsr/ReadCovariatesUnitTest.java | 78 ++++++ .../bqsr/RecalibrationReportUnitTest.java | 130 +++++++++ 12 files changed, 532 insertions(+), 152 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index ecb19c6e6..d3be2d888 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -65,6 +65,14 @@ public class BQSRGatherer extends Gatherer { if (generalReport == null) throw new ReviewedStingException(EMPTY_INPUT_LIST); + RecalibrationArgumentCollection RAC = generalReport.getRAC(); + if (RAC.recalibrationReport != null) { + RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); + RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + } + else + RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + generalReport.calculateEmpiricalAndQuantizedQualities(); generalReport.output(outputFile); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index 2b48e5871..3ef25f9b8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -277,7 +277,42 @@ public class BQSRKeyManager { bitSet.and(mask); return chopNBitsFrom(bitSet, leadingBits); } - + + @Override + public boolean equals(Object o) { + if (!(o instanceof BQSRKeyManager)) + return false; + + BQSRKeyManager other = (BQSRKeyManager) o; + if (this == other) + return true; + + if (requiredCovariates.size() != other.requiredCovariates.size() || optionalCovariates.size() != other.optionalCovariates.size()) + return false; + + Iterator otherRequiredIterator = other.requiredCovariates.iterator(); + for (RequiredCovariateInfo thisInfo: requiredCovariates) { + RequiredCovariateInfo otherInfo = otherRequiredIterator.next(); + + String thisName = thisInfo.covariate.getClass().getSimpleName(); + String otherName = otherInfo.covariate.getClass().getSimpleName(); + if (!thisName.equals(otherName)) + return false; + } + + Iterator otherOptionalIterator = other.optionalCovariates.iterator(); + for (OptionalCovariateInfo thisInfo : optionalCovariates) { + OptionalCovariateInfo otherInfo = otherOptionalIterator.next(); + String thisName = thisInfo.covariate.getClass().getSimpleName(); + String otherName = otherInfo.covariate.getClass().getSimpleName(); + if (!thisName.equals(otherName)) + return false; + } + + return true; + } + + /** * Aggregate information for each Covariate */ @@ -292,7 +327,7 @@ public class BQSRKeyManager { this.covariate = covariate; } } - + class OptionalCovariateInfo { public final BitSet covariateID; // cache the covariate ID public final Covariate covariate; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java index 77e4cc8c7..779500512 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -55,6 +55,11 @@ public class Datum { numMismatches = 0L; } + public Datum(long numObservations, long numMismatches) { + this.numObservations = numObservations; + this.numMismatches = numMismatches; + } + //--------------------------------------------------------------------------------------------------------------- // // increment methods @@ -90,4 +95,11 @@ public class Datum { return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); } + @Override + public boolean equals(Object o) { + if (!(o instanceof Datum)) + return false; + Datum other = (Datum) o; + return numMismatches == other.numMismatches && numObservations == other.numObservations; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java index fc4445b22..74b759da5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -62,4 +62,19 @@ public class ReadCovariates { for (int i = 0; i < covariateValues.length; i++) keySet[i][nextCovariateIndex] = covariateValues[i]; } + + /** + * Testing routines + */ + protected BitSet[][] getMismatchesKeySet() { + return mismatchesKeySet; + } + + protected BitSet[][] getInsertionsKeySet() { + return insertionsKeySet; + } + + protected BitSet[][] getDeletionsKeySet() { + return deletionsKeySet; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index 64dba0551..177177e97 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -25,22 +25,24 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import java.io.File; +import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.*; @@ -74,11 +76,13 @@ public class RecalDataManager { public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - private final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; + private static final String SCRIPT_FILE = "BQSR.R"; + + public enum SOLID_RECAL_MODE { /** * Treat reference inserted bases as reference matching bases. Very unsafe! @@ -309,6 +313,130 @@ public class RecalDataManager { report.print(outputFile); } + private static Pair initializeRecalibrationPlot(File filename) { + final PrintStream deltaTableStream; + final File deltaTableFileName = new File(filename + ".csv"); + try { + deltaTableStream = new PrintStream(deltaTableFileName); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created"); + } + return new Pair(deltaTableStream, deltaTableFileName); + } + + private static void outputRecalibrationPlot(Pair files, boolean keepIntermediates) { + final File csvFileName = files.getSecond(); + final File plotFileName = new File(csvFileName + ".pdf"); + files.getFirst().close(); + + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class)); + executor.addArgs(csvFileName.getAbsolutePath()); + executor.addArgs(plotFileName.getAbsolutePath()); + executor.exec(); + + if (!keepIntermediates) + if (!csvFileName.delete()) + throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath()); + + } + + public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, boolean keepIntermediates) { + Pair files = initializeRecalibrationPlot(filename); + writeCSV(files.getFirst(), original, "ORIGINAL", true); + outputRecalibrationPlot(files, keepIntermediates); + } + + public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, LinkedHashMap> recalibrated, boolean keepIntermediates) { + Pair files = initializeRecalibrationPlot(filename); + writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true); + writeCSV(files.getFirst(), original, "ORIGINAL", false); + outputRecalibrationPlot(files, keepIntermediates); + } + + private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap> map, String recalibrationMode, boolean printHeader) { + final int QUALITY_SCORE_COVARIATE_INDEX = 1; + final Map deltaTable = new HashMap(); + + + for (Map.Entry> tableEntry : map.entrySet()) { + BQSRKeyManager keyManager = tableEntry.getKey(); + + if (keyManager.getOptionalCovariates().size() > 0) { // only need the 'all covariates' table + Map table = tableEntry.getValue(); + + // create a key manager for the delta table + List requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate + List optionalCovariates = keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate + optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates + BQSRKeyManager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager + + + // create delta table + for (Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table + RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) + + List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key + byte originalQuality = Byte.parseByte((String) covs.get(QUALITY_SCORE_COVARIATE_INDEX)); // save the original quality for accuracy calculation later on + covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table + + covs.set(1, originalQuality); // replace the covariate value with the quality score + covs.set(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) + deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum, originalQuality); // add this covariate to the delta table + } + + // print header + if (printHeader) { + List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + deltaTableFile.println(Utils.join(",", header)); + } + + // print each data line + for(Map.Entry deltaEntry : deltaTable.entrySet()) { + List deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); + RecalDatum deltaDatum = deltaEntry.getValue(); + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.toString()); + deltaTableFile.println("," + recalibrationMode); + } + + } + + } + } + + /** + * Updates the current AccuracyDatum element in the delta table. + * + * If it doesn't have an element yet, it creates an AccuracyDatum element and adds it to the delta table. + * + * @param deltaTable the delta table + * @param deltaKey the key to the table + * @param recalDatum the recal datum to combine with the accuracyDatum element in the table + * @param originalQuality the quality score to we can calculate the accuracy for the accuracyDatum element + */ + private static void addToDeltaTable(Map deltaTable, BitSet deltaKey, RecalDatum recalDatum, byte originalQuality) { + AccuracyDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + if (deltaDatum == null) + deltaTable.put(deltaKey, new AccuracyDatum(recalDatum, originalQuality)); // if we don't have a key yet, create a new one with the same values as the curent datum + else + deltaDatum.combine(recalDatum, originalQuality); // if we do have a datum, combine it with this one. + } + + /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string * @@ -382,127 +510,6 @@ public class RecalDataManager { return false; } - /** - * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero - * - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param originalQualScores The array of original quality scores to set to zero if needed - * @param refBases The reference which has been RC'd if necessary - * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar - * @return The byte array of original quality scores some of which might have been set to zero - */ - private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) { - - final boolean negStrand = read.getReadNegativeStrandFlag(); - for (int iii = 1; iii < originalQualScores.length; iii++) { - if (inconsistency[iii] == 1) { - if (readBases[iii] == refBases[iii]) { - if (negStrand) { - originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0; - } - else { - originalQualScores[iii] = (byte) 0; - } - if (setBaseN) { - readBases[iii] = (byte) 'N'; - } - } - // Set the prev base to Q0 as well - if (readBases[iii - 1] == refBases[iii - 1]) { - if (negStrand) { - originalQualScores[originalQualScores.length - iii] = (byte) 0; - } - else { - originalQualScores[iii - 1] = (byte) 0; - } - if (setBaseN) { - readBases[iii - 1] = (byte) 'N'; - } - } - } - } - if (negStrand) { - readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases(readBases); - - return originalQualScores; - } - - /** - * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference - * - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param colorImpliedBases The bases implied by the color space, RC'd if necessary - * @param refBases The reference which has been RC'd if necessary - */ - private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) { - - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpaceQuals; - if (attr instanceof String) { - String x = (String) attr; - colorSpaceQuals = x.getBytes(); - SAMUtils.fastqToPhred(colorSpaceQuals); - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - - for (int iii = 1; iii < inconsistency.length - 1; iii++) { - if (inconsistency[iii] == 1) { - for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read - if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step - if (readBases[jjj] == refBases[jjj]) { - if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2); - if (rand == 0) { // The color implied base won the coin flip - readBases[jjj] = colorImpliedBases[jjj]; - } - } - else { - final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); - final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); - int diffInQuality = maxQuality - minQuality; - int numLow = minQuality; - if (numLow == 0) { - numLow++; - diffInQuality++; - } - final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh); - if (rand >= numLow) { // higher q score won - if (maxQuality == (int) colorSpaceQuals[jjj]) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had higher q score, and won out, so nothing to do here - } - else { // lower q score won - if (minQuality == (int) colorSpaceQuals[jjj]) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had lower q score, and won out, so nothing to do here - } - } - } - } - } - } - } - - if (read.getReadNegativeStrandFlag()) { - readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases(readBases); - } - else { // No color space quality tag in file - throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); - } - } - /** * Given the base and the color calculate the next base in the sequence * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 2dac90252..ded3e619b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Random; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -114,4 +118,29 @@ public class RecalDatum extends Datum { return Math.pow(10.0, qual / -10.0); } + public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + Random random = new Random(); + int nObservations = random.nextInt(maxObservations); + int nErrors = random.nextInt(maxErrors); + Datum datum = new Datum(nObservations, nErrors); + double empiricalQuality = datum.empiricalQualDouble(); + double estimatedQReported = empiricalQuality + ((10 * random.nextDouble()) - 5); // empirical quality +/- 5. + return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); + } + + /** + * We don't compare the estimated quality reported because it may be different when read from + * report tables. + * + * @param o the other recal datum + * @return true if the two recal datums have the same number of observations, errors and empirical quality. + */ + @Override + public boolean equals(Object o) { + if (!(o instanceof RecalDatum)) + return false; + RecalDatum other = (RecalDatum) o; + return super.equals(o) && + MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index b5768eedd..598312916 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -170,6 +170,8 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") public boolean NO_PLOTS = false; + public File recalibrationReport = null; + public GATKReportTable generateReportTable() { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run"); argumentsTable.addPrimaryKey("Argument"); @@ -190,6 +192,7 @@ public class RecalibrationArgumentCollection { argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); + argumentsTable.set("recalibration_report", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); return argumentsTable; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index 2962c4674..bd1ba112b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -73,19 +73,26 @@ public class RecalibrationReport { keysAndTablesMap.put(keyManager, table); } + protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) { + this.quantizationInfo = quantizationInfo; + this.keysAndTablesMap = keysAndTablesMap; + this.argumentTable = argumentTable; + this.RAC = RAC; + } + /** - * Combines two recalibration reports by adding all observations and errors - * - * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate - * them after combining. The reason for not calculating it is because this function is inteded for combining a - * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized - * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, - * makes this method faster - * - * Note2: The empirical quality reported, however, is recalculated given its simplicity. - * - * @param other the recalibration report to combine with this one - */ + * Combines two recalibration reports by adding all observations and errors + * + * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate + * them after combining. The reason for not calculating it is because this function is inteded for combining a + * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized + * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, + * makes this method faster + * + * Note2: The empirical quality reported, however, is recalculated given its simplicity. + * + * @param other the recalibration report to combine with this one + */ public void combine(RecalibrationReport other) { Iterator>> thisIterator = keysAndTablesMap.entrySet().iterator(); @@ -285,6 +292,12 @@ public class RecalibrationReport { else if (primaryKey.equals("no_plots")) RAC.NO_PLOTS = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("no_plots")) + RAC.NO_PLOTS = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("recalibration_report")) + RAC.recalibrationReport = (value == null) ? null : new File((String) value); } return RAC; @@ -305,4 +318,45 @@ public class RecalibrationReport { public void output(PrintStream output) { RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output); } + + public RecalibrationArgumentCollection getRAC() { + return RAC; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof RecalibrationReport)) + return false; + RecalibrationReport other = (RecalibrationReport) o; + if (this == o) + return true; + return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap); + } + + private boolean isEqualTable(LinkedHashMap> t1, LinkedHashMap> t2) { + if (t1.size() != t2.size()) + return false; + + Iterator>> t1Iterator = t1.entrySet().iterator(); + Iterator>> t2Iterator = t2.entrySet().iterator(); + + while (t1Iterator.hasNext() && t2Iterator.hasNext()) { + Map.Entry> t1MapEntry = t1Iterator.next(); + Map.Entry> t2MapEntry = t2Iterator.next(); + + if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey()))) + return false; + + Map table2 = t2MapEntry.getValue(); + for (Map.Entry t1TableEntry : t1MapEntry.getValue().entrySet()) { + BitSet t1Key = t1TableEntry.getKey(); + if (!table2.containsKey(t1Key)) + return false; + RecalDatum t1Datum = t1TableEntry.getValue(); + if (!t1Datum.equals(table2.get(t1Key))) + return false; + } + } + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 81ebb0fa7..c2f7117f8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -738,8 +738,12 @@ public class ReadUtils { } public static GATKSAMRecord createRandomRead(int length) { + return createRandomRead(length, true); + } + + public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { byte[] quals = ReadUtils.createRandomReadQuals(length); - byte[] bbases = ReadUtils.createRandomReadBases(length, true); + byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index 4b384aac0..5a522e81e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -31,24 +31,29 @@ public class ContextCovariateUnitTest { GATKSAMRecord read = ReadUtils.createRandomRead(1000); GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); CovariateValues values = covariate.getValues(read); - verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); - verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); - verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); + verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, clippedRead, covariate); } - private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { - for (int i = 0; i < values.length; i++) { - String expectedContext = null; - if (i - contextSize + 1 >= 0) { - String context = bases.substring(i - contextSize + 1, i + 1); - if (!context.contains("N")) - expectedContext = context; - } - Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext); + public static void verifyCovariateArray(BitSet[] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + Assert.assertEquals(contextCovariate.keyFromBitSet(values[i]), expectedContext(read, i, contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; } + return expectedContext; } - private String stringFrom(byte[] array) { + private static String stringFrom(byte[] array) { String s = ""; for (byte value : array) s += (char) value; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java new file mode 100644 index 000000000..c25a6dba2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java @@ -0,0 +1,78 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class ReadCovariatesUnitTest { + + @Test(enabled = true) + public void testCovariateGeneration() { + final String RGID = "id"; + final int length = 10; + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(RGID); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); + final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); + + ReadGroupCovariate rgCov = new ReadGroupCovariate(); + QualityScoreCovariate qsCov = new QualityScoreCovariate(); + ContextCovariate coCov = new ContextCovariate(); + CycleCovariate cyCov = new CycleCovariate(); + + rgCov.initialize(RAC); + qsCov.initialize(RAC); + coCov.initialize(RAC); + cyCov.initialize(RAC); + + List requestedCovariates = new ArrayList(4); + requestedCovariates.add(rgCov); + requestedCovariates.add(qsCov); + requestedCovariates.add(coCov); + requestedCovariates.add(cyCov); + + ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.keyFromBitSet(rc.getMismatchesKeySet(i)[0]), RGID); + Assert.assertEquals(rgCov.keyFromBitSet(rc.getInsertionsKeySet(i)[0]), RGID); + Assert.assertEquals(rgCov.keyFromBitSet(rc.getDeletionsKeySet(i)[0]), RGID); + + // check quality score + Assert.assertEquals(qsCov.keyFromBitSet(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.keyFromBitSet(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.keyFromBitSet(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.keyFromBitSet(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); + Assert.assertEquals(coCov.keyFromBitSet(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INSERTIONS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.keyFromBitSet(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.DELETIONS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.keyFromBitSet(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.keyFromBitSet(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.keyFromBitSet(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..9911300c6 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java @@ -0,0 +1,130 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @Test(enabled = true) + public void testOutput() { + final int length = 100; + + List quals = new ArrayList(QualityUtils.MAX_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final LinkedHashMap> keysAndTablesMap = new LinkedHashMap>(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + final List requestedCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(rgKeyManager, new HashMap()); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(qsKeyManager, new HashMap()); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(cvKeyManager, new HashMap()); + + for (Covariate cov : requiredCovariates) + requestedCovariates.add(cov); + for (Covariate cov : optionalCovariates) + requestedCovariates.add(cov); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + + final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INSERTIONS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + for (int offset = 0; offset < length; offset++) { + for (Map.Entry> entry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = entry.getKey(); + Map table = entry.getValue(); + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION)) { + table.put(key, RecalDatum.createRandomRecalDatum(10000, 10)); + nKeys++; + } + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getInsertionsKeySet(offset), EventType.BASE_INSERTION)) { + table.put(key, RecalDatum.createRandomRecalDatum(100000, 10)); + nKeys++; + } + + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getDeletionsKeySet(offset), EventType.BASE_DELETION)) { + table.put(key, RecalDatum.createRandomRecalDatum(100000, 10)); + nKeys++; + } + + } + } + Assert.assertEquals(nKeys, expectedKeys); + + RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC); + + File output = new File("RecalibrationReportUnitTestOutuput.grp"); + PrintStream out; + try { + out = new PrintStream(output); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("couldn't create the file " + output, e); + } + report.output(out); + + RecalibrationReport loadedReport = new RecalibrationReport(output); + + Assert.assertTrue(report.equals(loadedReport)); + if (!output.delete()) + throw new ReviewedStingException("File could not be deleted " + output); + } + + private static int expectedNumberOfKeys (int nCovariates, int readLength, int indelContextSize, int mismatchesContextSize) { + int nommcs = readLength >= mismatchesContextSize ? mismatchesContextSize-1 : readLength; + int noincs = readLength >= indelContextSize ? 2*(indelContextSize-1) : 2*readLength; + return (nCovariates * readLength * 3) - nommcs - noincs; + } + +}