BQSRGatherer handles missing read groups from some input files. [#68720468]
This commit is contained in:
parent
b07c0a6b4c
commit
3047d6ff32
|
|
@ -46,16 +46,18 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.commandline.Gatherer;
|
import org.broadinstitute.sting.commandline.Gatherer;
|
||||||
|
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
|
||||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* User: carneiro
|
* User: carneiro
|
||||||
|
|
@ -64,22 +66,57 @@ import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
public class BQSRGatherer extends Gatherer {
|
public class BQSRGatherer extends Gatherer {
|
||||||
|
|
||||||
|
private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
|
||||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
|
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
|
||||||
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
||||||
|
private static final String MISSING_READ_GROUPS = "Missing read group(s)";
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void gather(List<File> inputs, File output) {
|
public void gather(final List<File> inputs, final File output) {
|
||||||
final PrintStream outputFile;
|
final PrintStream outputFile;
|
||||||
try {
|
try {
|
||||||
outputFile = new PrintStream(output);
|
outputFile = new PrintStream(output);
|
||||||
} catch(FileNotFoundException e) {
|
} catch(FileNotFoundException e) {
|
||||||
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
|
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
|
||||||
}
|
}
|
||||||
|
final GATKReport report = gatherReport(inputs);
|
||||||
|
report.print(outputFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gathers the input recalibration reports into a single report.
|
||||||
|
*
|
||||||
|
* @param inputs Input recalibration GATK reports
|
||||||
|
* @return gathered recalibration GATK report
|
||||||
|
*/
|
||||||
|
public static GATKReport gatherReport(final List<File> inputs) {
|
||||||
|
final SortedSet<String> allReadGroups = new TreeSet<String>();
|
||||||
|
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
|
||||||
|
|
||||||
|
// Get the read groups from each input report
|
||||||
|
for (final File input : inputs) {
|
||||||
|
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
|
||||||
|
inputReadGroups.put(input, readGroups);
|
||||||
|
allReadGroups.addAll(readGroups);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log the read groups that are missing from specific inputs
|
||||||
|
for (Map.Entry<File, Set<String>> entry: inputReadGroups.entrySet()) {
|
||||||
|
final File input = entry.getKey();
|
||||||
|
final Set<String> readGroups = entry.getValue();
|
||||||
|
if (allReadGroups.size() != readGroups.size()) {
|
||||||
|
// Missing read groups are not completely unexpected, so log at info level: more than debug, but less than a proper warning.
|
||||||
|
logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
|
||||||
|
for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
|
||||||
|
logger.info(" " + readGroup);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
RecalibrationReport generalReport = null;
|
RecalibrationReport generalReport = null;
|
||||||
for (File input : inputs) {
|
for (File input : inputs) {
|
||||||
final RecalibrationReport inputReport = new RecalibrationReport(input);
|
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
|
||||||
if( inputReport.isEmpty() ) { continue; }
|
if( inputReport.isEmpty() ) { continue; }
|
||||||
|
|
||||||
if (generalReport == null)
|
if (generalReport == null)
|
||||||
|
|
@ -92,6 +129,6 @@ public class BQSRGatherer extends Gatherer {
|
||||||
|
|
||||||
generalReport.calculateQuantizedQualities();
|
generalReport.calculateQuantizedQualities();
|
||||||
|
|
||||||
generalReport.output(outputFile);
|
return generalReport.createGATKReport();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -501,10 +501,6 @@ public class RecalUtils {
|
||||||
return covariate.getClass().getSimpleName().split("Covariate")[0];
|
return covariate.getClass().getSimpleName().split("Covariate")[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
|
|
||||||
outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), RAC.RECAL_TABLE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a human-readable string representing the used covariates
|
* Return a human-readable string representing the used covariates
|
||||||
*
|
*
|
||||||
|
|
@ -518,16 +514,48 @@ public class RecalUtils {
|
||||||
return Utils.join(",", names);
|
return Utils.join(",", names);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile, boolean sortByCols) {
|
/**
|
||||||
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols), outputFile);
|
* Outputs the GATK report to RAC.RECAL_TABLE.
|
||||||
|
*
|
||||||
|
* @param RAC The list of shared command line arguments
|
||||||
|
* @param quantizationInfo Quantization info
|
||||||
|
* @param recalibrationTables Recalibration tables
|
||||||
|
* @param requestedCovariates The list of requested covariates
|
||||||
|
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
|
||||||
|
*/
|
||||||
|
public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
|
||||||
|
final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
|
||||||
|
report.print(RAC.RECAL_TABLE);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void outputRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables, final PrintStream outputFile) {
|
/**
|
||||||
|
* Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||||
|
*
|
||||||
|
* @param argumentTable Argument table
|
||||||
|
* @param quantizationInfo Quantization info
|
||||||
|
* @param recalibrationTables Recalibration tables
|
||||||
|
* @param requestedCovariates The list of requested covariates
|
||||||
|
* @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
|
||||||
|
* @return GATK report
|
||||||
|
*/
|
||||||
|
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) {
|
||||||
|
return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||||
|
*
|
||||||
|
* @param argumentTable Argument table
|
||||||
|
* @param quantizationTable Quantization Table
|
||||||
|
* @param recalTables Other recal tables
|
||||||
|
* @return GATK report
|
||||||
|
*/
|
||||||
|
private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List<GATKReportTable> recalTables) {
|
||||||
final GATKReport report = new GATKReport();
|
final GATKReport report = new GATKReport();
|
||||||
report.addTable(argumentTable);
|
report.addTable(argumentTable);
|
||||||
report.addTable(quantizationTable);
|
report.addTable(quantizationTable);
|
||||||
report.addTables(recalTables);
|
report.addTables(recalTables);
|
||||||
report.print(outputFile);
|
return report;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -77,8 +77,12 @@ public class RecalibrationReport {
|
||||||
private final int[] tempQUALarray = new int[3];
|
private final int[] tempQUALarray = new int[3];
|
||||||
private final int[] tempCOVarray = new int[4];
|
private final int[] tempCOVarray = new int[4];
|
||||||
|
|
||||||
public RecalibrationReport(final File RECAL_FILE) {
|
public RecalibrationReport(final File recalFile) {
|
||||||
final GATKReport report = new GATKReport(RECAL_FILE);
|
this(recalFile, getReadGroups(recalFile));
|
||||||
|
}
|
||||||
|
|
||||||
|
public RecalibrationReport(final File recalFile, final SortedSet<String> allReadGroups) {
|
||||||
|
final GATKReport report = new GATKReport(recalFile);
|
||||||
|
|
||||||
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
|
argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);
|
||||||
RAC = initializeArgumentCollectionTable(argumentTable);
|
RAC = initializeArgumentCollectionTable(argumentTable);
|
||||||
|
|
@ -104,7 +108,9 @@ public class RecalibrationReport {
|
||||||
for (Covariate cov : requestedCovariates)
|
for (Covariate cov : requestedCovariates)
|
||||||
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
|
||||||
|
|
||||||
recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE)));
|
recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size());
|
||||||
|
|
||||||
|
initializeReadGroupCovariates(allReadGroups);
|
||||||
|
|
||||||
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
|
parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable());
|
||||||
|
|
||||||
|
|
@ -115,16 +121,27 @@ public class RecalibrationReport {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Counts the number of unique read groups in the table
|
* Gets the unique read groups in the recal file
|
||||||
*
|
*
|
||||||
* @param reportTable the GATKReport table containing data for this table
|
* @param recalFile the recal file, which is read as a GATK report
|
||||||
* @return the number of unique read groups
|
* @return the unique read groups
|
||||||
*/
|
*/
|
||||||
private int countReadGroups(final GATKReportTable reportTable) {
|
public static SortedSet<String> getReadGroups(final File recalFile) {
|
||||||
Set<String> readGroups = new HashSet<String>();
|
return getReadGroups(new GATKReport(recalFile));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the unique read groups in the table
|
||||||
|
*
|
||||||
|
* @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE
|
||||||
|
* @return the unique read groups
|
||||||
|
*/
|
||||||
|
private static SortedSet<String> getReadGroups(final GATKReport report) {
|
||||||
|
final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
|
||||||
|
final SortedSet<String> readGroups = new TreeSet<String>();
|
||||||
for ( int i = 0; i < reportTable.getNumRows(); i++ )
|
for ( int i = 0; i < reportTable.getNumRows(); i++ )
|
||||||
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
|
readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString());
|
||||||
return readGroups.size();
|
return readGroups;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -160,6 +177,20 @@ public class RecalibrationReport {
|
||||||
return requestedCovariates;
|
return requestedCovariates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize read group keys using the shared list of all the read groups.
|
||||||
|
*
|
||||||
|
* By using the same sorted set of read groups across all recalibration reports, even if
|
||||||
|
* one report is missing a read group, all the reports use the same read group keys.
|
||||||
|
*
|
||||||
|
* @param allReadGroups The sorted set of all possible read groups
|
||||||
|
*/
|
||||||
|
private void initializeReadGroupCovariates(final SortedSet<String> allReadGroups) {
|
||||||
|
for (String readGroup: allReadGroups) {
|
||||||
|
requestedCovariates[0].keyFromValue(readGroup);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
* Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table
|
||||||
*
|
*
|
||||||
|
|
@ -358,8 +389,13 @@ public class RecalibrationReport {
|
||||||
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
|
quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void output(PrintStream output) {
|
/**
|
||||||
RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output, RAC.SORT_BY_ALL_COLUMNS);
|
* Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream).
|
||||||
|
*
|
||||||
|
* @return newly created recalibration report
|
||||||
|
*/
|
||||||
|
public GATKReport createGATKReport() {
|
||||||
|
return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RecalibrationArgumentCollection getRAC() {
|
public RecalibrationArgumentCollection getRAC() {
|
||||||
|
|
|
||||||
|
|
@ -183,4 +183,19 @@ public class BQSRGathererUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGatherMissingReadGroup() {
|
||||||
|
// TODO: This test data is NOT private but privateTestDir offers:
|
||||||
|
// TODO: - Doesn't end up in protected / private github
|
||||||
|
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
|
||||||
|
// Hand modified subset of problematic gather inputs submitted by George Grant
|
||||||
|
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
|
||||||
|
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
|
||||||
|
|
||||||
|
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
|
||||||
|
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
|
||||||
|
|
||||||
|
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue