diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index d6f0e16e8..84bf844f4 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -46,16 +46,18 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.apache.commons.collections.CollectionUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.recalibration.RecalUtils; import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; -import java.util.List; +import java.util.*; /** * User: carneiro @@ -64,22 +66,57 @@ import java.util.List; public class BQSRGatherer extends Gatherer { - + + private static final Logger logger = Logger.getLogger(BQSRGatherer.class); private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; private static final String MISSING_OUTPUT_FILE = "missing output file name"; + private static final String MISSING_READ_GROUPS = "Missing read group(s)"; @Override - public void gather(List inputs, File output) { + public void gather(final List inputs, final File output) { final PrintStream outputFile; try { outputFile = new PrintStream(output); } catch(FileNotFoundException e) { throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); } + final GATKReport report = gatherReport(inputs); + report.print(outputFile); + } + + /** + * Gathers the input recalibration reports into a single report. + * + * @param inputs Input recalibration GATK reports + * @return gathered recalibration GATK report + */ + public static GATKReport gatherReport(final List inputs) { + final SortedSet allReadGroups = new TreeSet(); + final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); + + // Get the read groups from each input report + for (final File input : inputs) { + final Set readGroups = RecalibrationReport.getReadGroups(input); + inputReadGroups.put(input, readGroups); + allReadGroups.addAll(readGroups); + } + + // Log the read groups that are missing from specific inputs + for (Map.Entry> entry: inputReadGroups.entrySet()) { + final File input = entry.getKey(); + final Set readGroups = entry.getValue(); + if (allReadGroups.size() != readGroups.size()) { + // Since this is not completely unexpected, more than debug, but less than a proper warning. + logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath()); + for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) { + logger.info(" " + readGroup); + } + } + } RecalibrationReport generalReport = null; for (File input : inputs) { - final RecalibrationReport inputReport = new RecalibrationReport(input); + final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); if( inputReport.isEmpty() ) { continue; } if (generalReport == null) @@ -92,6 +129,6 @@ public class BQSRGatherer extends Gatherer { generalReport.calculateQuantizedQualities(); - generalReport.output(outputFile); + return generalReport.createGATKReport(); } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 325237d05..b4849ca33 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -501,10 +501,6 @@ public class RecalUtils { return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE); - } - /** * Return a human-readable string representing the used covariates * @@ -518,16 +514,48 @@ public class RecalUtils { return Utils.join(",", names); } - public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) { - outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile); + /** + * Outputs the GATK report to RAC.RECAL_TABLE. + * + * @param RAC The list of shared command line arguments + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + */ + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + report.print(RAC.RECAL_TABLE); } - private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables, final PrintStream outputFile) { + /** + * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @param argumentTable Argument table + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + * @return GATK report + */ + public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) { + return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + } + + /** + * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @param argumentTable Argument table + * @param quantizationTable Quantization Table + * @param recalTables Other recal tables + * @return GATK report + */ + private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) { final GATKReport report = new GATKReport(); report.addTable(argumentTable); report.addTable(quantizationTable); report.addTables(recalTables); - report.print(outputFile); + return report; } /** s diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 091b5ecf0..0ff5e5de5 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -77,8 +77,12 @@ public class RecalibrationReport { private final int[] tempQUALarray = new int[3]; private final int[] tempCOVarray = new int[4]; - public RecalibrationReport(final File RECAL_FILE) { - final GATKReport report = new GATKReport(RECAL_FILE); + public RecalibrationReport(final File recalFile) { + this(recalFile, getReadGroups(recalFile)); + } + + public RecalibrationReport(final File recalFile, final SortedSet allReadGroups) { + final GATKReport report = new GATKReport(recalFile); argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); RAC = initializeArgumentCollectionTable(argumentTable); @@ -104,7 +108,9 @@ public class RecalibrationReport { for (Covariate cov : requestedCovariates) cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE))); + recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size()); + + initializeReadGroupCovariates(allReadGroups); parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); @@ -115,16 +121,27 @@ public class RecalibrationReport { } /** - * Counts the number of unique read groups in the table + * Gets the unique read groups in the recal file * - * @param reportTable the GATKReport table containing data for this table - * @return the number of unique read groups + * @param recalFile the recal file as a GATK Report + * @return the unique read groups */ - private int countReadGroups(final GATKReportTable reportTable) { - Set readGroups = new HashSet(); + public static SortedSet getReadGroups(final File recalFile) { + return getReadGroups(new GATKReport(recalFile)); + } + + /** + * Gets the unique read groups in the table + * + * @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE + * @return the unique read groups + */ + private static SortedSet getReadGroups(final GATKReport report) { + final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); + final SortedSet readGroups = new TreeSet(); for ( int i = 0; i < reportTable.getNumRows(); i++ ) readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); - return readGroups.size(); + return readGroups; } /** @@ -160,6 +177,20 @@ public class RecalibrationReport { return requestedCovariates; } + /** + * Initialize read group keys using the shared list of all the read groups. + * + * By using the same sorted set of read groups across all recalibration reports, even if + * one report is missing a read group, all the reports use the same read group keys. + * + * @param allReadGroups The list of all possible read groups + */ + private void initializeReadGroupCovariates(final SortedSet allReadGroups) { + for (String readGroup: allReadGroups) { + requestedCovariates[0].keyFromValue(readGroup); + } + } + /** * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table * @@ -358,8 +389,13 @@ public class RecalibrationReport { quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); } - public void output(PrintStream output) { - RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output, RAC.SORT_BY_ALL_COLUMNS); + /** + * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @return newly created recalibration report + */ + public GATKReport createGATKReport() { + return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); } public RecalibrationArgumentCollection getRAC() { diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index 658b8527d..7bfb2d86c 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -183,4 +183,19 @@ public class BQSRGathererUnitTest extends BaseTest { } } + + @Test + public void testGatherMissingReadGroup() { + // TODO: This test data is NOT private but privateTestDir offers: + // TODO: - Doesn't end up in protected / private github + // TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts + // Hand modified subset of problematic gather inputs submitted by George Grant + File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table"); + File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table"); + + GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2)); + GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1)); + + Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order."); + } }