BQSRGatherer handles missing read groups from some input files. [#68720468]
This commit is contained in:
parent
b07c0a6b4c
commit
3047d6ff32
|
|
@ -46,16 +46,18 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
|
|
@ -64,22 +66,57 @@ import java.util.List;
|
|||
|
||||
|
||||
public class BQSRGatherer extends Gatherer {
|
||||
|
||||
|
||||
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
|
||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
|
||||
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
||||
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
|
||||
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
public void gather(final List<File> inputs, final File output) {
|
||||
final PrintStream outputFile;
|
||||
try {
|
||||
outputFile = new PrintStream(output);
|
||||
} catch(FileNotFoundException e) {
|
||||
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
|
||||
}
|
||||
final GATKReport report = gatherReport(inputs);
|
||||
report.print(outputFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gathers the input recalibration reports into a single report.
|
||||
*
|
||||
* @param inputs Input recalibration GATK reports
|
||||
* @return gathered recalibration GATK report
|
||||
*/
|
||||
public static GATKReport gatherReport(final List<File> inputs) {
|
||||
final SortedSet<String> allReadGroups = new TreeSet<String>();
|
||||
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
|
||||
|
||||
// Get the read groups from each input report
|
||||
for (final File input : inputs) {
|
||||
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
|
||||
inputReadGroups.put(input, readGroups);
|
||||
allReadGroups.addAll(readGroups);
|
||||
}
|
||||
|
||||
// Log the read groups that are missing from specific inputs
|
||||
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
|
||||
final File input = entry.getKey();
|
||||
final Set<String> readGroups = entry.getValue();
|
||||
if (allReadGroups.size() != readGroups.size()) {
|
||||
// Since this is not completely unexpected, more than debug, but less than a proper warning.
|
||||
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
|
||||
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
|
||||
logger.info(" " + readGroup);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RecalibrationReport generalReport = null;
|
||||
for (File input : inputs) {
|
||||
final RecalibrationReport inputReport = new RecalibrationReport(input);
|
||||
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
|
||||
if( inputReport.isEmpty() ) { continue; }
|
||||
|
||||
if (generalReport == null)
|
||||
|
|
@ -92,6 +129,6 @@ public class BQSRGatherer extends Gatherer {
|
|||
|
||||
generalReport.calculateQuantizedQualities();
|
||||
|
||||
generalReport.output(outputFile);
|
||||
return generalReport.createGATKReport();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -501,10 +501,6 @@ public class RecalUtils {
|
|||
return covariate.getClass().getSimpleName().split("Covariate")[0];
|
||||
}
|
||||
|
||||
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
|
||||
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a human-readable string representing the used covariates
|
||||
*
|
||||
|
|
@ -518,16 +514,48 @@ public class RecalUtils {
|
|||
return Utils.join(",", names);
|
||||
}
|
||||
|
||||
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) {
|
||||
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile);
|
||||
/**
|
||||
* Outputs the GATK report to RAC.RECAL_TABLE.
|
||||
*
|
||||
* @param RAC The list of shared command line arguments
|
||||
* @param quantizationInfo Quantization info
|
||||
* @param recalibrationTables Recalibration tables
|
||||
* @param requestedCovariates The list of requested covariates
|
||||
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
|
||||
*/
|
||||
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
|
||||
final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
|
||||
report.print(RAC.RECAL_TABLE);
|
||||
}
|
||||
|
||||
private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables, final PrintStream outputFile) {
|
||||
/**
|
||||
* Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||
*
|
||||
* @param argumentTable Argument table
|
||||
* @param quantizationInfo Quantization info
|
||||
* @param recalibrationTables Recalibration tables
|
||||
* @param requestedCovariates The list of requested covariates
|
||||
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
|
||||
* @return GATK report
|
||||
*/
|
||||
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) {
|
||||
return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||
*
|
||||
* @param argumentTable Argument table
|
||||
* @param quantizationTable Quantization Table
|
||||
* @param recalTables Other recal tables
|
||||
* @return GATK report
|
||||
*/
|
||||
private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables) {
|
||||
final GATKReport report = new GATKReport();
|
||||
report.addTable(argumentTable);
|
||||
report.addTable(quantizationTable);
|
||||
report.addTables(recalTables);
|
||||
report.print(outputFile);
|
||||
return report;
|
||||
}
|
||||
|
||||
/** s
|
||||
|
|
|
|||
|
|
@ -77,8 +77,12 @@ public class RecalibrationReport {
|
|||
private final int[] tempQUALarray = new int[3];
|
||||
private final int[] tempCOVarray = new int[4];
|
||||
|
||||
public RecalibrationReport(final File RECAL_FILE) {
|
||||
final GATKReport report = new GATKReport(RECAL_FILE);
|
||||
public RecalibrationReport(final File recalFile) {
|
||||
this(recalFile, getReadGroups(recalFile));
|
||||
}
|
||||
|
||||
public RecalibrationReport(final File recalFile, final SortedSet<String> allReadGroups) {
|
||||
final GATKReport report = new GATKReport(recalFile);
|
||||
|
||||
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
|
||||
RAC = initializeArgumentCollectionTable(argumentTable);
|
||||
|
|
@ -104,7 +108,9 @@ public class RecalibrationReport {
|
|||
for (Covariate cov : requestedCovariates)
|
||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE)));
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size());
|
||||
|
||||
initializeReadGroupCovariates(allReadGroups);
|
||||
|
||||
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
|
||||
|
||||
|
|
@ -115,16 +121,27 @@ public class RecalibrationReport {
|
|||
}
|
||||
|
||||
/**
|
||||
* Counts the number of unique read groups in the table
|
||||
* Gets the unique read groups in the recal file
|
||||
*
|
||||
* @param reportTable the GATKReport table containing data for this table
|
||||
* @return the number of unique read groups
|
||||
* @param recalFile the recal file as a GATK Report
|
||||
* @return the unique read groups
|
||||
*/
|
||||
private int countReadGroups(final GATKReportTable reportTable) {
|
||||
Set<String> readGroups = new HashSet<String>();
|
||||
public static SortedSet<String> getReadGroups(final File recalFile) {
|
||||
return getReadGroups(new GATKReport(recalFile));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the unique read groups in the table
|
||||
*
|
||||
* @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE
|
||||
* @return the unique read groups
|
||||
*/
|
||||
private static SortedSet<String> getReadGroups(final GATKReport report) {
|
||||
final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
|
||||
final SortedSet<String> readGroups = new TreeSet<String>();
|
||||
for ( int i = 0; i < reportTable.getNumRows(); i++ )
|
||||
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
|
||||
return readGroups.size();
|
||||
return readGroups;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -160,6 +177,20 @@ public class RecalibrationReport {
|
|||
return requestedCovariates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize read group keys using the shared list of all the read groups.
|
||||
*
|
||||
* By using the same sorted set of read groups across all recalibration reports, even if
|
||||
* one report is missing a read group, all the reports use the same read group keys.
|
||||
*
|
||||
* @param allReadGroups The list of all possible read groups
|
||||
*/
|
||||
private void initializeReadGroupCovariates(final SortedSet<String> allReadGroups) {
|
||||
for (String readGroup: allReadGroups) {
|
||||
requestedCovariates[0].keyFromValue(readGroup);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
||||
*
|
||||
|
|
@ -358,8 +389,13 @@ public class RecalibrationReport {
|
|||
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
|
||||
}
|
||||
|
||||
public void output(PrintStream output) {
|
||||
RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output, RAC.SORT_BY_ALL_COLUMNS);
|
||||
/**
|
||||
* Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||
*
|
||||
* @return newly created recalibration report
|
||||
*/
|
||||
public GATKReport createGATKReport() {
|
||||
return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
|
||||
}
|
||||
|
||||
public RecalibrationArgumentCollection getRAC() {
|
||||
|
|
|
|||
|
|
@ -183,4 +183,19 @@ public class BQSRGathererUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGatherMissingReadGroup() {
|
||||
// TODO: This test data is NOT private but privateTestDir offers:
|
||||
// TODO: - Doesn't end up in protected / private github
|
||||
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
|
||||
// Hand modified subset of problematic gather inputs submitted by George Grant
|
||||
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
|
||||
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
|
||||
|
||||
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
|
||||
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
|
||||
|
||||
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue