From 0d4ea30d6d6e5db9c3e6db08215a2a80d4977958 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro
Date: Sun, 18 Mar 2012 22:31:54 -0400
Subject: [PATCH] Updating the BQSR Gatherer to the new file format

This is important for quick turnaround in the analysis cycle of the new covariates. Also added a dummy unit test that doesn't really test anything (disabled), but helps in debugging.
---
 .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 154 ++++++++++++++++++
 .../bqsr/RecalibrationArgumentCollection.java |   3 +-
 .../walkers/bqsr/BQSRGathererUnitTest.java    |  29 ++++
 3 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
 create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
new file mode 100755
index 000000000..3712f0cc5
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.broadinstitute.sting.commandline.Gatherer;
+import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Merges several recalibration CSV tables into a single table.
+ *
+ * Data lines that share the same key are combined via RecalDatumOptimized.increment; comment
+ * lines (starting with '#') are copied from the first input only, and every input must
+ * contain an EOF marker line.
+ *
+ * User: carneiro
+ * Date: 3/29/11
+ */
+public class BQSRGatherer extends Gatherer {
+
+    /////////////////////////////
+    // Private Member Variables
+    /////////////////////////////
+    private static final String EOF_MARKER = "EOF";
+
+    // Number of trailing data fields on every line; only the first two are parsed here.
+    private static final int NUM_DATA_FIELDS = 3;
+
+    // Accumulated data, keyed by the comma-terminated prefix that precedes the data fields.
+    private final HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
+
+    /**
+     * Parses one data line and folds its values into dataMap.
+     *
+     * The last three comma-separated fields are treated as data: the first two of them are
+     * parsed as longs and the third is ignored. Everything before them forms the key.
+     *
+     * @param line a comma-separated data line from a recalibration table
+     * @throws ReviewedStingException if the line has fewer than three fields
+     */
+    private void addCSVData(String line) {
+        final String[] fields = line.split(",");
+        if (fields.length < NUM_DATA_FIELDS)
+            throw new ReviewedStingException("Line has fewer than " + NUM_DATA_FIELDS + " comma-separated fields : " + line);
+
+        final int firstDataField = fields.length - NUM_DATA_FIELDS;
+        final StringBuilder keyBuilder = new StringBuilder();
+        for (int i = 0; i < firstDataField; i++)
+            keyBuilder.append(fields[i]).append(',');
+        final String key = keyBuilder.toString();
+
+        final RecalDatumOptimized values = new RecalDatumOptimized(
+                Long.parseLong(fields[firstDataField]),
+                Long.parseLong(fields[firstDataField + 1]));
+
+        final RecalDatumOptimized currentValues = dataMap.get(key);
+        if (currentValues == null)
+            dataMap.put(key, values);
+        else
+            currentValues.increment(values);
+    }
+
+    /**
+     * Combines the given recalibration tables and writes the merged table to output.
+     *
+     * @param inputs the recalibration tables to merge; each must contain an EOF marker line
+     * @param output the file the merged table is written to
+     * @throws UserException if the output cannot be opened; nested UserException types are thrown when an input cannot be found or lacks the EOF marker
+     */
+    @Override
+    public void gather(List<File> inputs, File output) {
+        final PrintStream o;
+        try {
+            o = new PrintStream(output);
+        } catch (FileNotFoundException e) {
+            throw new UserException("File to be output by CountCovariates Gather function was not found");
+        }
+
+        try {
+            dataMap.clear(); // do not carry data over from a previous gather() call
+            boolean printedHeader = false;
+
+            // Read input files
+            for (File inputFile : inputs) {
+                // Tracked per file so that a truncated input is caught even when an earlier one was complete
+                boolean sawEOF = false;
+                try {
+                    for (String line : new XReadLines(inputFile)) {
+                        if (EOF_MARKER.equals(line)) {
+                            sawEOF = true; // sanity check
+                            break;
+                        }
+
+                        if (line.startsWith("#")) {
+                            // Header lines are only copied from the first input
+                            if (!printedHeader)
+                                o.println(line);
+                        }
+                        else {
+                            addCSVData(line); // Parse the line and add the data to the HashMap
+                        }
+                    }
+                } catch (FileNotFoundException e) {
+                    throw new UserException.CouldNotReadInputFile(inputFile, "Can not find input file", e);
+                }
+
+                if (!sawEOF) {
+                    final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
+                    throw new UserException.MalformedFile(inputFile, errorMessage);
+                }
+                printedHeader = true;
+            }
+
+            // Write output file from dataMap
+            for (Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet())
+                o.println(entry.getKey() + entry.getValue().outputToCSV());
+            o.println(EOF_MARKER);
+        } finally {
+            o.close(); // release the file handle even when an input is malformed
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index ab173e4fb..40f28f644 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
 
 import org.broad.tribble.Feature;
 import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
 
 import java.io.PrintStream;
 import java.util.Collections;
@@ -59,7 +58,7 @@ public class RecalibrationArgumentCollection {
      * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
      * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
      */
-    @Gather(CountCovariatesGatherer.class)
+    @Gather(BQSRGatherer.class)
     @Output
     protected PrintStream RECAL_FILE;
 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
new file mode 100644
index 000000000..f1df6f9a7
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
@@ -0,0 +1,29 @@
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * @author Mauricio Carneiro
+ * @since 3/7/12
+ */
+public class BQSRGathererUnitTest {
+    RecalibrationArgumentCollection RAC;
+
+    private static File recal1 = new File("public/testdata/exampleCSV.csv");
+    private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
+
+    @Test(enabled = false)
+    public void testCombineTwoFiles() {
+        BQSRGatherer gatherer = new BQSRGatherer();
+        List<File> recalFiles = new LinkedList<File>();
+        File output = new File("foo.csv");
+
+        recalFiles.add(recal1);
+        recalFiles.add(recal2);
+        gatherer.gather(recalFiles, output);
+    }
+}