Count Covariates is almost scatter-gatherable now!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5537 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-29 22:25:33 +00:00
parent 5ddc0e464a
commit 5d26c66769
3 changed files with 158 additions and 0 deletions

View File

@@ -0,0 +1,108 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Gathers the per-shard recalibration CSV tables produced by scatter-gathered
 * CountCovariates runs into a single merged recalibration table.
 *
 * User: carneiro
 * Date: 3/29/11
 */
public class CountCovariatesGatherer extends Gatherer {
    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
    private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
    private static final String EOF_MARKER = "EOF";
    // Each data line ends with this many numeric columns; everything before them is the covariate key.
    private static final int NUM_DATA_COLUMNS = 3;
    private HashMap<String, int[]> dataMap;

    /**
     * Parses one CSV data line ("cov1,cov2,...,v1,v2,v3") and folds its counts into dataMap.
     * The key is the comma-joined covariate columns (with a trailing comma); the value is the
     * trailing NUM_DATA_COLUMNS integers, summed element-wise with any previously seen entry.
     *
     * @param line a non-comment, non-header data line from a recal CSV file
     */
    private void addCSVData(String line) {
        final String[] covariates = line.split(",");
        if (covariates.length < NUM_DATA_COLUMNS + 1) {
            throw new UserException("Malformed recal CSV line passed to CountCovariates Gather function: " + line);
        }
        final int firstValueIndex = covariates.length - NUM_DATA_COLUMNS;

        // Build the covariate key; note it intentionally keeps a trailing comma so the
        // data columns can be appended directly when writing the output.
        final StringBuilder keyBuilder = new StringBuilder();
        for (int i = 0; i < firstValueIndex; i++) {
            keyBuilder.append(covariates[i]).append(",");
        }
        final String key = keyBuilder.toString();

        final int[] values = new int[NUM_DATA_COLUMNS];
        for (int i = firstValueIndex; i < covariates.length; i++) {
            // Bug fix: index into values relative to the start of the data columns
            // (the original indexed values[i] and overran the 3-element array).
            values[i - firstValueIndex] = Integer.parseInt(covariates[i].trim());
        }

        final int[] currentValues = dataMap.get(key);
        if (currentValues != null) {
            for (int i = 0; i < 2; i++) {
                values[i] += currentValues[i]; // todo -- update the third value using the CountCovariatesWalker function
            }
        }
        dataMap.put(key, values);
    }

    /**
     * Merges the scattered recal CSV tables in inputs into a single table written to output.
     * The header is copied once from the first file that has one; data lines with identical
     * covariate keys are summed. Each input must end with the EOF sanity marker, and the
     * merged output is terminated with the same marker so it is itself a valid recal table.
     *
     * @param inputs per-shard recal CSV files produced by CountCovariates
     * @param output destination for the merged recal table
     */
    @Override
    public void gather(List<File> inputs, File output) {
        dataMap = new HashMap<String, int[]>();
        PrintStream o;
        try {
            o = new PrintStream(output);
        } catch (FileNotFoundException e) {
            throw new UserException("File to be output by CountCovariates Gather function was not found");
        }

        try {
            boolean headerPrinted = false;

            // Read input files
            for (File recalFile : inputs) {
                boolean sawEOF = false; // per-file sanity check (the original flag leaked across files)
                try {
                    for (String line : new XReadLines(recalFile)) {
                        if (EOF_MARKER.equals(line)) {
                            sawEOF = true; // sanity check
                        } else if (COMMENT_PATTERN.matcher(line).matches() || COVARIATE_PATTERN.matcher(line).matches()) {
                            if (!headerPrinted) {
                                headerPrinted = true;
                                o.println(line);
                            } // Skip over the header (could check if headers are the same, but probably not necessary)
                        } else { // Found a line of data
                            addCSVData(line); // Parse the line and add the data to the HashMap
                        }
                    }
                } catch (FileNotFoundException e) {
                    throw new UserException.CouldNotReadInputFile(recalFile, "Can not find input file", e);
                }
                if (!sawEOF) {
                    final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
                    throw new UserException.MalformedFile(recalFile, errorMessage);
                }
            }

            // Write output file from dataMap. The key already ends with a comma, so the
            // data columns are appended without a leading separator (the original emitted
            // a doubled comma here).
            for (String key : dataMap.keySet()) {
                final int[] values = dataMap.get(key);
                o.println(key + values[0] + "," + values[1] + "," + values[2]);
            }

            // Terminate the merged table with the same EOF marker this gatherer demands
            // of its inputs, so downstream consumers pass the same sanity check.
            o.println(EOF_MARKER);
        } finally {
            o.close(); // don't leak the output stream
        }
    }
}

View File

@@ -29,6 +29,7 @@ import org.broad.tribble.bed.BEDCodec;
import org.broad.tribble.dbsnp.DbSNPCodec;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.commandline.Gather;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -75,6 +76,7 @@ import java.util.Map;
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
/////////////////////////////
@@ -90,6 +92,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@Output
@Gather(CountCovariatesGatherer.class)
PrintStream out;
/////////////////////////////

View File

@@ -0,0 +1,47 @@
/*
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 3/29/11
* Time: 5:31 PM
*/
package oneoffs.carneiro;
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
/**
 * Minimal Queue pipeline that runs a single scatter-gathered CountCovariates
 * job (scatterCount = 4) to exercise the CountCovariatesGatherer.
 */
class QuickCCTest extends QScript {
  qscript =>

  @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
  var GATKjar: File = _

  @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
  var input: File = _

  @Input(doc="Reference fasta file", shortName="R", required=false)
  var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")

  @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
  var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")

  @Input(shortName="L", required=false)
  var intervals: List[String] = Nil

  val queueLogDir: String = ".qlog/"

  def script = {
    // Configure one CountCovariates walker over the supplied BAM(s).
    val countCov = new CountCovariates()
    countCov.reference_sequence = reference
    countCov.input_file :+= input
    countCov.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
    countCov.intervalsString = intervals
    countCov.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
    // Split into 4 shards so the scatter/gather path is actually used.
    countCov.scatterCount = 4
    countCov.recal_file = new File("recal.csv")
    add(countCov)
  }
}