From 76bb8fd9e58c857c8ca08c38e5f4e09f59c29677 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Mon, 20 Jun 2016 13:30:39 -0400 Subject: [PATCH] Allows GatherBqsrReports to accept a .list file as input. --- .../gatk/tools/GatherBqsrReports.java | 7 +- .../walkers/bqsr/BQSRGathererUnitTest.java | 149 +++++++++++++----- .../engine/recalibration/BQSRGatherer.java | 36 ++++- .../gatk/tools/CatVariants.java | 5 +- 4 files changed, 147 insertions(+), 50 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java index 787181428..f343d0bab 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/GatherBqsrReports.java @@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools; import htsjdk.samtools.util.IOUtil; import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -97,10 +99,7 @@ import java.util.List; * */ -@CommandLineProgramProperties( - usage = "Gathers scattered BQSR recalibration reports into a single file", - usageShort = "Gathers scattered BQSR recalibration reports into a single file" -) +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_QC) public class GatherBqsrReports extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="List of scattered BQSR files") public List INPUT; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java index d44fe6cda..c515be885 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java @@ -59,8 +59,12 @@ import org.broadinstitute.gatk.engine.recalibration.RecalUtils; import org.testng.Assert; import org.testng.annotations.Test; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; +import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -70,38 +74,36 @@ import java.util.List; */ public class BQSRGathererUnitTest extends BaseTest { - private static File recal1 = new File(privateTestDir + "HiSeq.1mb.1RG.sg1.table"); - private static File recal2 = new File(privateTestDir + "HiSeq.1mb.1RG.sg2.table"); - private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table"); - private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table"); - private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table"); - private static File recalEmpty = new File(privateTestDir + "HiSeq.1mb.1RG.empty.table"); + private static File recal1 = new File(privateTestDir, "HiSeq.1mb.1RG.sg1.table"); + private static File recal2 = new File(privateTestDir, "HiSeq.1mb.1RG.sg2.table"); + private static File recal3 = new File(privateTestDir, "HiSeq.1mb.1RG.sg3.table"); + private static File recal4 = new File(privateTestDir, "HiSeq.1mb.1RG.sg4.table"); + private static File recal5 = new File(privateTestDir, "HiSeq.1mb.1RG.sg5.table"); + private static File recalEmpty = new File(privateTestDir, "HiSeq.1mb.1RG.empty.table"); - private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table"); + private static File recal_original = new File(privateTestDir, "HiSeq.1mb.1RG.noSG.table"); + private static File recal_many = new File(privateTestDir, "bqsr.manyObservations.full.table"); - @Test(enabled = true) + @Test public void testManyObservations() { - File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table"); + final File recal = new File(privateTestDir, "bqsr.manyObservations.piece.table"); final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); - List recalFiles = new LinkedList (); + final List recalFiles = new LinkedList<>(); for ( int i=0; i < 5; i++ ) recalFiles.add(recal); - BQSRGatherer gatherer = new BQSRGatherer(); + final BQSRGatherer gatherer = new BQSRGatherer(); gatherer.gather(recalFiles, output); - GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table")); - GATKReport calculatedReport = new GATKReport(output); - - testReports(originalReport, calculatedReport); + testReports(recal_many, output); } - @Test(enabled = true) + @Test public void testGatherBQSR() { - BQSRGatherer gatherer = new BQSRGatherer(); - List recalFiles = new LinkedList (); + final BQSRGatherer gatherer = new BQSRGatherer(); + final List recalFiles = new LinkedList<>(); final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); recalFiles.add(recal1); @@ -111,16 +113,13 @@ public class BQSRGathererUnitTest extends BaseTest { recalFiles.add(recal5); gatherer.gather(recalFiles, output); - GATKReport originalReport = new GATKReport(recal_original); - GATKReport calculatedReport = new GATKReport(output); - - testReports(originalReport, calculatedReport); + testReports(recal_original, output); } - @Test(enabled = true) + @Test public void testGatherBQSRWithEmptyFile() { - BQSRGatherer gatherer = new BQSRGatherer(); - List recalFiles = new LinkedList (); + final BQSRGatherer gatherer = new BQSRGatherer(); + final List recalFiles = new LinkedList<>(); final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); recalFiles.add(recal1); @@ -131,13 +130,12 @@ public class BQSRGathererUnitTest extends BaseTest { recalFiles.add(recalEmpty); gatherer.gather(recalFiles, output); - GATKReport originalReport = new GATKReport(recal_original); - GATKReport calculatedReport = new GATKReport(output); - - testReports(originalReport, calculatedReport); + testReports(recal_original, output); } - private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) { + private void testReports(final File originalFile, final File calculatedFile) { + final GATKReport originalReport = new GATKReport(originalFile); + final GATKReport calculatedReport = new GATKReport(calculatedFile); // test the Arguments table List columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); @@ -177,11 +175,11 @@ public class BQSRGathererUnitTest extends BaseTest { * @param calculated the calculated table * @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor) */ - private void testTablesWithColumns(GATKReportTable original, GATKReportTable calculated, List columnsToTest) { - for (int row = 0; row < original.getNumRows(); row++ ) { - for (String column : columnsToTest) { - Object actual = calculated.get(new Integer(row), column); - Object expected = original.get(row, column); + private void testTablesWithColumns(final GATKReportTable original, final GATKReportTable calculated, final List columnsToTest) { + for (int row = 0; row < original.getNumRows(); row++) { + for (final String column : columnsToTest) { + final Object actual = calculated.get(Integer.valueOf(row), column); + final Object expected = original.get(row, column); //if ( !actual.equals(expected) ) // System.out.println("Row=" + row + " Table=" + original.getTableName() + " Column=" + column + " Expected=" + expected + " Actual=" + actual); Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Column=" + column); @@ -196,12 +194,83 @@ public class BQSRGathererUnitTest extends BaseTest { // TODO: - Doesn't end up in protected / private github // TODO: - IS otherwise available for local debugging unlike /humgen NFS mounts // Hand modified subset of problematic gather inputs submitted by George Grant - File input1 = new File(privateTestDir + "NA12878.rg_subset.chr1.recal_data.table"); - File input2 = new File(privateTestDir + "NA12878.rg_subset.chrY_Plus.recal_data.table"); + final File input1 = new File(privateTestDir, "NA12878.rg_subset.chr1.recal_data.table"); + final File input2 = new File(privateTestDir, "NA12878.rg_subset.chrY_Plus.recal_data.table"); - GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2)); - GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1)); + final GATKReport report12 = BQSRGatherer.gatherReport(Arrays.asList(input1, input2)); + final GATKReport report21 = BQSRGatherer.gatherReport(Arrays.asList(input2, input1)); Assert.assertTrue(report12.equals(report21), "GATK reports are different when gathered in a different order."); } -} + + @Test + public void testParseInputsAsList() { + final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list"); + try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) { + bw.write(recal1.getAbsolutePath() + "\n"); + bw.write(recal2.getAbsolutePath() + "\n"); + bw.write(recal3.getAbsolutePath() + "\n"); + bw.write(recal4.getAbsolutePath() + "\n"); + bw.write(recal5.getAbsolutePath() + "\n"); + } catch (final IOException ioe) { + Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test."); + } + + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + final BQSRGatherer gatherer = new BQSRGatherer(); + gatherer.gather(Collections.singletonList(inputListFile), output); + + testReports(recal_original, output); + } + + @Test + public void testParseInputsAsMultipleFiles() { + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + final BQSRGatherer gatherer = new BQSRGatherer(); + gatherer.gather(Arrays.asList(recal1, recal2, recal3, recal4, recal5), output); + + testReports(recal_original, output); + } + + @Test + public void testParseInputsMixedSingleList() { + final File inputListFile = BaseTest.createTempFile("BQSRGatherer.parse.input", ".list"); + try (final BufferedWriter bw = new BufferedWriter(new FileWriter(inputListFile))) { + bw.write(recal2.getAbsolutePath() + "\n"); + bw.write(recal3.getAbsolutePath() + "\n"); + bw.write(recal4.getAbsolutePath() + "\n"); + } catch (final IOException ioe) { + Assert.fail("Could not create temporary list of input files for BQSRGatherer unit test."); + } + + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + final BQSRGatherer gatherer = new BQSRGatherer(); + gatherer.gather(Arrays.asList(recal1, inputListFile, recal5), output); + + testReports(recal_original, output); + } + + @Test + public void testParseInputsMixedMultipleLists() { + final File inputListFile1 = BaseTest.createTempFile("BQSRGatherer.parse.input.1", ".list"); + final File inputListFile2 = BaseTest.createTempFile("BQSRGatherer.parse.input.2", ".list"); + try (final BufferedWriter bw1 = new BufferedWriter(new FileWriter(inputListFile1)); + final BufferedWriter bw2 = new BufferedWriter(new FileWriter(inputListFile2))) { + bw1.write(recal2.getAbsolutePath() + "\n"); + bw1.write(recal3.getAbsolutePath() + "\n"); + bw2.write(recal5.getAbsolutePath() + "\n"); + } catch (final IOException ioe) { + Assert.fail("Could not create temporary lists of input files for BQSRGatherer unit test."); + } + + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + final BQSRGatherer gatherer = new BQSRGatherer(); + gatherer.gather(Arrays.asList(recal1, inputListFile1, recal4, inputListFile2), output); + + testReports(recal_original, output); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java index 134106db7..2dbe1d50c 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java @@ -31,9 +31,11 @@ import org.broadinstitute.gatk.utils.commandline.Gatherer; import org.broadinstitute.gatk.utils.report.GATKReport; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.XReadLines; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.util.*; @@ -55,7 +57,7 @@ public class BQSRGatherer extends Gatherer { final PrintStream outputFile; try { outputFile = new PrintStream(output); - } catch(FileNotFoundException e) { + } catch(final FileNotFoundException e) { throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); } final GATKReport report = gatherReport(inputs); @@ -70,10 +72,13 @@ public class BQSRGatherer extends Gatherer { */ public static GATKReport gatherReport(final List inputs) { final SortedSet allReadGroups = new TreeSet(); - final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); + final LinkedHashMap> inputReadGroups = new LinkedHashMap<>(); + + // Parse the input list for .list files and replace them with the files contained within them + final List parsedInputs = parseInputList(inputs); // Get the read groups from each input report - for (final File input : inputs) { + for (final File input : parsedInputs) { final Set readGroups = RecalibrationReport.getReadGroups(input); inputReadGroups.put(input, readGroups); allReadGroups.addAll(readGroups); @@ -93,7 +98,7 @@ public class BQSRGatherer extends Gatherer { } RecalibrationReport generalReport = null; - for (File input : inputs) { + for (final File input : parsedInputs) { final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); if( inputReport.isEmpty() ) { continue; } @@ -109,4 +114,27 @@ public class BQSRGatherer extends Gatherer { return generalReport.createGATKReport(); } + + /** + * Replaces any .list files in rawFileList with the files named in said .list file. + * Identical to parseVariantList method in CatVariants. + * @param rawFileList the original file list, possibly including .list files + * @return a new List, with .list files replaced + */ + private static List parseInputList(final List rawFileList) { + final List result = new ArrayList<>(rawFileList.size()); + for (final File rawFile : rawFileList) { + if (rawFile.getName().endsWith(".list")) { + try { + for (final String line : new XReadLines(rawFile, true)) + result.add(new File(line)); + } catch (final IOException e) { + throw new UserException.CouldNotReadInputFile(rawFile, e); + } + } else { + result.add(rawFile); + } + } + return result; + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index b62f1717e..65936795c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -195,7 +195,8 @@ public class CatVariants extends CommandLineProgram { } /** - * Replaces any .list files in rawFileList with the files named in said .list file + * Replaces any .list files in rawFileList with the files named in said .list file. + * Identical to {@link org.broadinstitute.gatk.engine.recalibration.BQSRGatherer#parseInputList}. * @param rawFileList the original file list, possibly including .list files * @return a new List, with .list files replaced */ @@ -206,7 +207,7 @@ public class CatVariants extends CommandLineProgram { try { for (final String line : new XReadLines(rawFile, true)) result.add(new File(line)); - } catch (IOException e) { + } catch (final IOException e) { throw new UserException.CouldNotReadInputFile(rawFile, e); } } else {