BQSRGatherer handles missing read groups from some input files. [#68720468]

This commit is contained in:
Khalid Shakir 2014-04-08 02:59:51 +08:00
parent b07c0a6b4c
commit 3047d6ff32
4 changed files with 141 additions and 25 deletions

View File

@ -46,16 +46,18 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
import java.util.*;
/**
* User: carneiro
@ -64,22 +66,57 @@ import java.util.List;
public class BQSRGatherer extends Gatherer {
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
private static final String MISSING_OUTPUT_FILE = "missing output file name";
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
@Override
public void gather(List<File> inputs, File output) {
public void gather(final List<File> inputs, final File output) {
final PrintStream outputFile;
try {
outputFile = new PrintStream(output);
} catch(FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
}
final GATKReport report = gatherReport(inputs);
report.print(outputFile);
}
/**
* Gathers the input recalibration reports into a single report.
*
* @param inputs Input recalibration GATK reports
* @return gathered recalibration GATK report
*/
public static GATKReport gatherReport(final List<File> inputs) {
final SortedSet<String> allReadGroups = new TreeSet<String>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
// Get the read groups from each input report
for (final File input : inputs) {
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
inputReadGroups.put(input, readGroups);
allReadGroups.addAll(readGroups);
}
// Log the read groups that are missing from specific inputs
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
final File input = entry.getKey();
final Set<String> readGroups = entry.getValue();
if (allReadGroups.size() != readGroups.size()) {
// Since this is not completely unexpected, more than debug, but less than a proper warning.
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
logger.info(" " + readGroup);
}
}
}
RecalibrationReport generalReport = null;
for (File input : inputs) {
final RecalibrationReport inputReport = new RecalibrationReport(input);
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
if( inputReport.isEmpty() ) { continue; }
if (generalReport == null)
@ -92,6 +129,6 @@ public class BQSRGatherer extends Gatherer {
generalReport.calculateQuantizedQualities();
generalReport.output(outputFile);
return generalReport.createGATKReport();
}
}

View File

@ -501,10 +501,6 @@ public class RecalUtils {
return covariate.getClass().getSimpleName().split("Covariate")[0];
}
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE);
}
/**
* Return a human-readable string representing the used covariates
*
@ -518,16 +514,48 @@ public class RecalUtils {
return Utils.join(",", names);
}
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) {
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile);
/**
* Outputs the GATK report to RAC.RECAL_TABLE.
*
* @param RAC The list of shared command line arguments
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param requestedCovariates The list of requested covariates
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
*/
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
report.print(RAC.RECAL_TABLE);
}
private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables, final PrintStream outputFile) {
/**
* Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param requestedCovariates The list of requested covariates
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
* @return GATK report
*/
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) {
return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
}
/**
* Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationTable Quantization Table
* @param recalTables Other recal tables
* @return GATK report
*/
private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables) {
final GATKReport report = new GATKReport();
report.addTable(argumentTable);
report.addTable(quantizationTable);
report.addTables(recalTables);
report.print(outputFile);
return report;
}
/** s

View File

@ -77,8 +77,12 @@ public class RecalibrationReport {
private final int[] tempQUALarray = new int[3];
private final int[] tempCOVarray = new int[4];
public RecalibrationReport(final File RECAL_FILE) {
final GATKReport report = new GATKReport(RECAL_FILE);
public RecalibrationReport(final File recalFile) {
this(recalFile, getReadGroups(recalFile));
}
public RecalibrationReport(final File recalFile, final SortedSet<String> allReadGroups) {
final GATKReport report = new GATKReport(recalFile);
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
RAC = initializeArgumentCollectionTable(argumentTable);
@ -104,7 +108,9 @@ public class RecalibrationReport {
for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE)));
recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size());
initializeReadGroupCovariates(allReadGroups);
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
@ -115,16 +121,27 @@ public class RecalibrationReport {
}
/**
* Counts the number of unique read groups in the table
* Gets the unique read groups in the recal file
*
* @param reportTable the GATKReport table containing data for this table
* @return the number of unique read groups
* @param recalFile the recal file as a GATK Report
* @return the unique read groups
*/
private int countReadGroups(final GATKReportTable reportTable) {
Set<String> readGroups = new HashSet<String>();
public static SortedSet<String> getReadGroups(final File recalFile) {
return getReadGroups(new GATKReport(recalFile));
}
/**
* Gets the unique read groups in the table
*
* @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE
* @return the unique read groups
*/
private static SortedSet<String> getReadGroups(final GATKReport report) {
final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
final SortedSet<String> readGroups = new TreeSet<String>();
for ( int i = 0; i < reportTable.getNumRows(); i++ )
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
return readGroups.size();
return readGroups;
}
/**
@ -160,6 +177,20 @@ public class RecalibrationReport {
return requestedCovariates;
}
/**
* Initialize read group keys using the shared list of all the read groups.
*
* By using the same sorted set of read groups across all recalibration reports, even if
* one report is missing a read group, all the reports use the same read group keys.
*
* @param allReadGroups The list of all possible read groups
*/
private void initializeReadGroupCovariates(final SortedSet<String> allReadGroups) {
for (String readGroup: allReadGroups) {
requestedCovariates[0].keyFromValue(readGroup);
}
}
/**
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
*
@ -358,8 +389,13 @@ public class RecalibrationReport {
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
}
public void output(PrintStream output) {
RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output, RAC.SORT_BY_ALL_COLUMNS);
/**
* Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @return newly created recalibration report
*/
public GATKReport createGATKReport() {
return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
}
public RecalibrationArgumentCollection getRAC() {

View File

@ -183,4 +183,19 @@ public class BQSRGathererUnitTest extends BaseTest {
}
}
@Test
public void testGatherMissingReadGroup() {
// TODO: This test data is NOT private but privateTestDir offers:
// TODO: - Doesn't end up in protected / private github
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
// Hand modified subset of problematic gather inputs submitted by George Grant
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
}
}