Merge pull request #1412 from broadinstitute/sl_issue_1285

Allows GatherBqsrReports to accept a .list file as input.
This commit is contained in:
samuelklee 2016-06-22 15:25:19 -04:00 committed by GitHub
commit 679941c1b1
4 changed files with 147 additions and 50 deletions

View File

@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
@ -97,10 +99,7 @@ import java.util.List;
*
*/
@CommandLineProgramProperties(
usage = "Gathers scattered BQSR recalibration reports into a single file",
usageShort = "Gathers scattered BQSR recalibration reports into a single file"
)
@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC)
public class GatherBqsrReports extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="List of scattered BQSR files")
public List<File> INPUT;

View File

@ -59,8 +59,12 @@ import org.broadinstitute.gatk.engine.recalibration.RecalUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -70,38 +74,36 @@ import java.util.List;
*/
public class BQSRGathererUnitTest extends BaseTest {
private static File recal1 = new File(privateTestDir + "HiSeq.1mb.1RG.sg1.table");
private static File recal2 = new File(privateTestDir + "HiSeq.1mb.1RG.sg2.table");
private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table");
private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table");
private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table");
private static File recalEmpty = new File(privateTestDir + "HiSeq.1mb.1RG.empty.table");
private static File recal1 = new File(privateTestDir, "HiSeq.1mb.1RG.sg1.table");
private static File recal2 = new File(privateTestDir, "HiSeq.1mb.1RG.sg2.table");
private static File recal3 = new File(privateTestDir, "HiSeq.1mb.1RG.sg3.table");
private static File recal4 = new File(privateTestDir, "HiSeq.1mb.1RG.sg4.table");
private static File recal5 = new File(privateTestDir, "HiSeq.1mb.1RG.sg5.table");
private static File recalEmpty = new File(privateTestDir, "HiSeq.1mb.1RG.empty.table");
private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");
private static File recal_original = new File(privateTestDir, "HiSeq.1mb.1RG.noSG.table");
private static File recal_many = new File(privateTestDir, "bqsr.manyObservations.full.table");
@Test(enabled = true)
@Test
public void testManyObservations() {
File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table");
final File recal = new File(privateTestDir, "bqsr.manyObservations.piece.table");
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
List<File> recalFiles = new LinkedList<File> ();
final List<File> recalFiles = new LinkedList<>();
for ( int i=0; i < 5; i++ )
recalFiles.add(recal);
BQSRGatherer gatherer = new BQSRGatherer();
final BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table"));
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_many, output);
}
@Test(enabled = true)
@Test
public void testGatherBQSR() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
final BQSRGatherer gatherer = new BQSRGatherer();
final List<File> recalFiles = new LinkedList<>();
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
recalFiles.add(recal1);
@ -111,16 +113,13 @@ public class BQSRGathererUnitTest extends BaseTest {
recalFiles.add(recal5);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_original, output);
}
@Test(enabled = true)
@Test
public void testGatherBQSRWithEmptyFile() {
BQSRGatherer gatherer = new BQSRGatherer();
List<File> recalFiles = new LinkedList<File> ();
final BQSRGatherer gatherer = new BQSRGatherer();
final List<File> recalFiles = new LinkedList<>();
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
recalFiles.add(recal1);
@ -131,13 +130,12 @@ public class BQSRGathererUnitTest extends BaseTest {
recalFiles.add(recalEmpty);
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
testReports(recal_original, output);
}
private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) {
private void testReports(final File originalFile, final File calculatedFile) {
final GATKReport originalReport = new GATKReport(originalFile);
final GATKReport calculatedReport = new GATKReport(calculatedFile);
// test the Arguments table
List<String> columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
@ -177,11 +175,11 @@ public class BQSRGathererUnitTest extends BaseTest {
* @param calculated the calculated table
* @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor)
*/
private void testTablesWithColumns(GATKReportTable original, GATKReportTable calculated, List<String> columnsToTest) {
for (int row = 0; row < original.getNumRows(); row++ ) {
for (String column : columnsToTest) {
Object actual = calculated.get(new Integer(row), column);
Object expected = original.get(row, column);
private void testTablesWithColumns(final GATKReportTable original, final GATKReportTable calculated, final List<String> columnsToTest) {
for (int row = 0; row < original.getNumRows(); row++) {
for (final String column : columnsToTest) {
final Object actual = calculated.get(Integer.valueOf(row), column);
final Object expected = original.get(row, column);
//if ( !actual.equals(expected) )
// System.out.println("Row=" + row + " Table=" + original.getTableName() + " Column=" + column + " Expected=" + expected + " Actual=" + actual);
Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Column=" + column);
@ -196,12 +194,83 @@ public class BQSRGathererUnitTest extends BaseTest {
// TODO: - Doesn't end up in protected / private github
// TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts
// Hand modified subset of problematic gather inputs submitted by George Grant
File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table");
File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table");
final File input1 = new File(privateTestDir, "NA12878.rg_subset.chr1.recal_data.table");
final File input2 = new File(privateTestDir, "NA12878.rg_subset.chrY_Plus.recal_data.table");
GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
final GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2));
final GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1));
Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order.");
}
}
/**
 * Writes the absolute paths of recal1..recal5 into a single temporary .list file, passes
 * that file as the only input to {@link BQSRGatherer#gather}, and compares the gathered
 * output with recal_original via testReports.
 */
@Test
public void testParseInputsAsList() {
    final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list");
    try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) {
        // One absolute path per line, in the same order as the scattered tables.
        for (final File recal : Arrays.asList(recal1, recal2, recal3, recal4, recal5)) {
            bw.write(recal.getAbsolutePath() + "\n");
        }
    } catch (final IOException ioe) {
        // Hand the cause to TestNG so the underlying I/O error is visible in the failure report.
        Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test.", ioe);
    }

    final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
    final BQSRGatherer gatherer = new BQSRGatherer();
    gatherer.gather(Collections.singletonList(inputListFile), output);

    testReports(recal_original, output);
}
/**
 * Gathers recal1..recal5 passed directly as five separate inputs (no .list file involved)
 * and compares the gathered output with recal_original via testReports.
 */
@Test
public void testParseInputsAsMultipleFiles() {
    final List<File> scatteredTables = Arrays.asList(recal1, recal2, recal3, recal4, recal5);
    final File gatheredTable = BaseTest.createTempFile("BQSRgathererTest", ".table");

    new BQSRGatherer().gather(scatteredTables, gatheredTable);

    testReports(recal_original, gatheredTable);
}
/**
 * Mixes plain table files with one .list file: recal1 and recal5 are passed directly while
 * recal2..recal4 are named inside a temporary .list file placed between them. The gathered
 * output is compared with recal_original via testReports.
 */
@Test
public void testParseInputsMixedSingleList() {
    final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list");
    try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) {
        // One absolute path per line for the tables that are supplied through the list.
        for (final File recal : Arrays.asList(recal2, recal3, recal4)) {
            bw.write(recal.getAbsolutePath() + "\n");
        }
    } catch (final IOException ioe) {
        // Hand the cause to TestNG so the underlying I/O error is visible in the failure report.
        Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test.", ioe);
    }

    final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
    final BQSRGatherer gatherer = new BQSRGatherer();
    gatherer.gather(Arrays.asList(recal1, inputListFile, recal5), output);

    testReports(recal_original, output);
}
/**
 * Mixes plain table files with two .list files: the first list names recal2 and recal3,
 * the second names recal5, and recal1 / recal4 are passed directly. The inputs are given
 * in the order recal1, list 1, recal4, list 2, and the gathered output is compared with
 * recal_original via testReports.
 */
@Test
public void testParseInputsMixedMultipleLists() {
    final File inputListFile1 = BaseTest.createTempFile("BQSRGatherer.parse.input.1", ".list");
    final File inputListFile2 = BaseTest.createTempFile("BQSRGatherer.parse.input.2", ".list");
    try (final BufferedWriter bw1 = new BufferedWriter(new FileWriter(inputListFile1));
         final BufferedWriter bw2 = new BufferedWriter(new FileWriter(inputListFile2))) {
        bw1.write(recal2.getAbsolutePath() + "\n");
        bw1.write(recal3.getAbsolutePath() + "\n");
        bw2.write(recal5.getAbsolutePath() + "\n");
    } catch (final IOException ioe) {
        // Hand the cause to TestNG so the underlying I/O error is visible in the failure report.
        Assert.fail("Could not create temporary lists of input files for BQSRGatherer unit test.", ioe);
    }

    final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
    final BQSRGatherer gatherer = new BQSRGatherer();
    gatherer.gather(Arrays.asList(recal1, inputListFile1, recal4, inputListFile2), output);

    testReports(recal_original, output);
}
}

View File

@ -31,9 +31,11 @@ import org.broadinstitute.gatk.utils.commandline.Gatherer;
import org.broadinstitute.gatk.utils.report.GATKReport;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
@ -55,7 +57,7 @@ public class BQSRGatherer extends Gatherer {
final PrintStream outputFile;
try {
outputFile = new PrintStream(output);
} catch(FileNotFoundException e) {
} catch(final FileNotFoundException e) {
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
}
final GATKReport report = gatherReport(inputs);
@ -70,10 +72,13 @@ public class BQSRGatherer extends Gatherer {
*/
public static GATKReport gatherReport(final List<File> inputs) {
final SortedSet<String> allReadGroups = new TreeSet<String>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<File, Set<String>>();
final LinkedHashMap<File, Set<String>> inputReadGroups = new LinkedHashMap<>();
// Parse the input list for .list files and replace them with the files contained within them
final List<File> parsedInputs = parseInputList(inputs);
// Get the read groups from each input report
for (final File input : inputs) {
for (final File input : parsedInputs) {
final Set<String> readGroups = RecalibrationReport.getReadGroups(input);
inputReadGroups.put(input, readGroups);
allReadGroups.addAll(readGroups);
@ -93,7 +98,7 @@ public class BQSRGatherer extends Gatherer {
}
RecalibrationReport generalReport = null;
for (File input : inputs) {
for (final File input : parsedInputs) {
final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
if( inputReport.isEmpty() ) { continue; }
@ -109,4 +114,27 @@ public class BQSRGatherer extends Gatherer {
return generalReport.createGATKReport();
}
/**
 * Expands every .list entry of the given inputs into the files it names, keeping all other
 * entries as they are and preserving the overall order.
 * <p>
 * An entry is treated as a list when its file name ends with ".list"; each line read from it
 * through {@link XReadLines} becomes one {@link File} in the result. The logic is identical
 * to the parseVariantList method in CatVariants.
 *
 * @param rawFileList the original file list, possibly including .list files
 * @return a new List in which each .list file has been replaced by the files it names
 * @throws UserException.CouldNotReadInputFile if reading a .list file raises an IOException
 */
private static List<File> parseInputList(final List<File> rawFileList) {
    final List<File> expanded = new ArrayList<>(rawFileList.size());
    for (final File candidate : rawFileList) {
        final boolean isListFile = candidate.getName().endsWith(".list");
        if (!isListFile) {
            // Ordinary input: carried over untouched.
            expanded.add(candidate);
            continue;
        }
        try {
            for (final String listedPath : new XReadLines(candidate, true)) {
                expanded.add(new File(listedPath));
            }
        } catch (final IOException e) {
            throw new UserException.CouldNotReadInputFile(candidate, e);
        }
    }
    return expanded;
}
}

View File

@ -195,7 +195,8 @@ public class CatVariants extends CommandLineProgram {
}
/**
* Replaces any .list files in rawFileList with the files named in said .list file
* Replaces any .list files in rawFileList with the files named in said .list file.
* Identical to {@link org.broadinstitute.gatk.engine.recalibration.BQSRGatherer#parseInputList}.
* @param rawFileList the original file list, possibly including .list files
* @return a new List, with .list files replaced
*/
@ -206,7 +207,7 @@ public class CatVariants extends CommandLineProgram {
try {
for (final String line : new XReadLines(rawFile, true))
result.add(new File(line));
} catch (IOException e) {
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(rawFile, e);
}
} else {