BQSRGatherer handles missing read groups from some input files. [#68720468]

This commit is contained in:
Khalid Shakir 2014-04-08 02:59:51 +08:00
parent b07c0a6b4c
commit 3047d6ff32
4 changed files with 141 additions and 25 deletions

View File

@ -46,16 +46,18 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Gatherer; import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.List; import java.util.*;
/** /**
* User: carneiro * User: carneiro
@ -65,21 +67,56 @@ import java.util.List;
public class BQSRGatherer extends Gatherer { public class BQSRGatherer extends Gatherer {
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
private static final String MISSING_OUTPUT_FILE = "missing output file name"; private static final String MISSING_OUTPUT_FILE = "missing output file name";
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
@Override @Override
public void gather(List<File> inputs, File output) { public void gather(final List<File> inputs, final File output) {
final PrintStream outputFile; final PrintStream outputFile;
try { try {
outputFile = new PrintStream(output); outputFile = new PrintStream(output);
} catch(FileNotFoundException e) { } catch(FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
} }
final GATKReport report = gatherReport(inputs);
report.print(outputFile);
}
/**
* Gathers the input recalibration reports into a single report.
*
* @param inputs Input recalibration GATK reports
* @return gathered recalibration GATK report
*/
public static GATKReport gatherReport(final List<File> inputs) {
final SortedSet<String> allReadGroups = new TreeSet<String>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
// Get the read groups from each input report
for (final File input : inputs) {
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
inputReadGroups.put(input, readGroups);
allReadGroups.addAll(readGroups);
}
// Log the read groups that are missing from specific inputs
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
final File input = entry.getKey();
final Set<String> readGroups = entry.getValue();
if (allReadGroups.size() != readGroups.size()) {
// Since this is not completely unexpected, more than debug, but less than a proper warning.
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
logger.info(" " + readGroup);
}
}
}
RecalibrationReport generalReport = null; RecalibrationReport generalReport = null;
for (File input : inputs) { for (File input : inputs) {
final RecalibrationReport inputReport = new RecalibrationReport(input); final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
if( inputReport.isEmpty() ) { continue; } if( inputReport.isEmpty() ) { continue; }
if (generalReport == null) if (generalReport == null)
@ -92,6 +129,6 @@ public class BQSRGatherer extends Gatherer {
generalReport.calculateQuantizedQualities(); generalReport.calculateQuantizedQualities();
generalReport.output(outputFile); return generalReport.createGATKReport();
} }
} }

View File

@ -501,10 +501,6 @@ public class RecalUtils {
return covariate.getClass().getSimpleName().split("Covariate")[0]; return covariate.getClass().getSimpleName().split("Covariate")[0];
} }
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE);
}
/** /**
* Return a human-readable string representing the used covariates * Return a human-readable string representing the used covariates
* *
@ -518,16 +514,48 @@ public class RecalUtils {
return Utils.join(",", names); return Utils.join(",", names);
} }
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) { /**
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile); * Outputs the GATK report to RAC.RECAL_TABLE.
*
* @param RAC The list of shared command line arguments
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param requestedCovariates The list of requested covariates
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
*/
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
report.print(RAC.RECAL_TABLE);
} }
private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables, final PrintStream outputFile) { /**
* Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param requestedCovariates The list of requested covariates
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
* @return GATK report
*/
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) {
return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
}
/**
* Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationTable Quantization Table
* @param recalTables Other recal tables
* @return GATK report
*/
private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables) {
final GATKReport report = new GATKReport(); final GATKReport report = new GATKReport();
report.addTable(argumentTable); report.addTable(argumentTable);
report.addTable(quantizationTable); report.addTable(quantizationTable);
report.addTables(recalTables); report.addTables(recalTables);
report.print(outputFile); return report;
} }
/** s /** s

View File

@ -77,8 +77,12 @@ public class RecalibrationReport {
private final int[] tempQUALarray = new int[3]; private final int[] tempQUALarray = new int[3];
private final int[] tempCOVarray = new int[4]; private final int[] tempCOVarray = new int[4];
public RecalibrationReport(final File RECAL_FILE) { public RecalibrationReport(final File recalFile) {
final GATKReport report = new GATKReport(RECAL_FILE); this(recalFile, getReadGroups(recalFile));
}
public RecalibrationReport(final File recalFile, final SortedSet<String> allReadGroups) {
final GATKReport report = new GATKReport(recalFile);
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
RAC = initializeArgumentCollectionTable(argumentTable); RAC = initializeArgumentCollectionTable(argumentTable);
@ -104,7 +108,9 @@ public class RecalibrationReport {
for (Covariate cov : requestedCovariates) for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE))); recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size());
initializeReadGroupCovariates(allReadGroups);
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
@ -115,16 +121,27 @@ public class RecalibrationReport {
} }
/** /**
* Counts the number of unique read groups in the table * Gets the unique read groups in the recal file
* *
* @param reportTable the GATKReport table containing data for this table * @param recalFile the recal file as a GATK Report
* @return the number of unique read groups * @return the unique read groups
*/ */
private int countReadGroups(final GATKReportTable reportTable) { public static SortedSet<String> getReadGroups(final File recalFile) {
Set<String> readGroups = new HashSet<String>(); return getReadGroups(new GATKReport(recalFile));
}
/**
* Gets the unique read groups in the table
*
* @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE
* @return the unique read groups
*/
private static SortedSet<String> getReadGroups(final GATKReport report) {
final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
final SortedSet<String> readGroups = new TreeSet<String>();
for ( int i = 0; i < reportTable.getNumRows(); i++ ) for ( int i = 0; i < reportTable.getNumRows(); i++ )
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
return readGroups.size(); return readGroups;
} }
/** /**
@ -160,6 +177,20 @@ public class RecalibrationReport {
return requestedCovariates; return requestedCovariates;
} }
/**
* Initialize read group keys using the shared list of all the read groups.
*
* By using the same sorted set of read groups across all recalibration reports, even if
* one report is missing a read group, all the reports use the same read group keys.
*
* @param allReadGroups The list of all possible read groups
*/
private void initializeReadGroupCovariates(final SortedSet<String> allReadGroups) {
for (String readGroup: allReadGroups) {
requestedCovariates[0].keyFromValue(readGroup);
}
}
/** /**
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
* *
@ -358,8 +389,13 @@ public class RecalibrationReport {
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
} }
public void output(PrintStream output) { /**
RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output, RAC.SORT_BY_ALL_COLUMNS); * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @return newly created recalibration report
*/
public GATKReport createGATKReport() {
return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
} }
public RecalibrationArgumentCollection getRAC() { public RecalibrationArgumentCollection getRAC() {

View File

@ -183,4 +183,19 @@ public class BQSRGathererUnitTest extends BaseTest {
} }
} }
@Test
public void testGatherMissingReadGroup() {
// TODO: This test data is NOT private but privateTestDir offers:
// TODO: - Doesn't end up in protected / private github
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
// Hand modified subset of problematic gather inputs submitted by George Grant
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
}
} }