Added a playground concordance analyzer for summarizing VariantEval across a group.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2867 348d0f76-0448-11de-a6fe-93d51630548a
2010-02-22 20:28:52 +00:00 · 2010-02-22 20:28:52 +00:00 · 3738b76320
parent a640bd2d79
commit 3738b76320
5 changed files with 346 additions and 0 deletions
--- a/R/analyzeConcordance/analyzeConcordance.R
+++ b/R/analyzeConcordance/analyzeConcordance.R
@ -0,0 +1,61 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs(TRUE)
+
+base_name = args[1]
+input = args[2]
+
+d <- read.table(input, header=T)
+# separate the data into filtered and unfiltered
+
+d.filtered <- d[d$filter_type=="filtered",]
+d.unfiltered <- d[d$filter_type=="unfiltered",]
+
+if (nrow(d.filtered) > 0) {
+	d.display <- d.filtered
+} else {
+	d.display <- d.unfiltered
+}
+
+#
+# Plot histograms of the known versus novel Ti/Tv
+#
+
+outfile = paste(base_name, ".histograms.png", sep="")
+
+if (nrow(d.filtered) > 0) {
+  nFilterTypes <- 2
+} else {
+  nFilterTypes <- 1
+}
+
+png(outfile, width=600, height=(300 * nFilterTypes))
+par(cex=1.1, mfrow=c(1 * nFilterTypes,2))
+nbreaks <- 20
+color <- "grey"
+xlim <- c(0,4)
+
+hist(d.unfiltered$known_titv, nbreaks, col=color, xlim=xlim)
+hist(d.unfiltered$novel_titv, nbreaks, col=color, xlim=xlim)
+
+if (nrow(d.filtered) > 0) {
+  hist(d.filtered$known_titv, nbreaks, col=color, xlim=xlim)
+  hist(d.filtered$novel_titv, nbreaks, col=color, xlim=xlim)
+}
+
+dev.off()
+
+#
+# Plot samples in order of novel Ti/Tv versus known Ti/Tv
+#
+
+outfile = paste(base_name, ".novel_vs_known_titv.png", sep="")
+
+png(outfile, width=600, height=600)
+
+d.display <- d.display[order(d.display$novel_titv),]
+plot(1:length(d.display$known_titv),d.display$known_titv,type="b",col="blue",ylim=c(0,4), xlab="Sample #", ylab="Ti / Tv")
+points(1:length(d.display$novel_titv),d.display$novel_titv,type="b",col="red",ylim=c(0,4))
+legend("bottomright", c("known","novel"), col=c("blue","red"), pch=21)
+
+dev.off()
--- a/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java
+++ b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java
@ -0,0 +1,185 @@
+package org.broadinstitute.sting.playground.analyzeconcordance;
+
+import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.utils.xReadLines;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.PathUtils;
+import org.broadinstitute.sting.playground.utils.ProcessUtils;
+import org.apache.log4j.Logger;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Compares results of VariantEval across a population or a case/control group.
+ */
+public class AnalyzeConcordance extends CommandLineProgram {
+
+	@Argument(fullName = "group_name", shortName = "groupName", doc = "The name of the group which will be prefixed output files", required = false)
+	private String baseName = "analyze_concordance";
+	@Argument(fullName = "eval_list", shortName = "evalList", doc = "The input list of unfiltered eval files to analyze", required = true)
+	private String evalListFile = null;
+    @Argument(fullName = "filtered_eval_list", shortName = "filteredEvalList", doc = "The input list of filtered eval files to analyze", required = false)
+    private String filteredEvalListFile = null;
+    @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
+    private String outputDir = "analyzeConcordance";
+    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript", required = false)
+    private String pathToRscript = "env Rscript";
+    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting analyze concordance R scripts", required = false)
+    private String pathToResources = "R" + File.separator + "analyzeConcordance";
+
+    private enum EvalFilterType {
+        UNFILTERED, FILTERED
+    }
+
+    private static final AnalyzeConcordanceField[] ANALYZE_CONCORDANCE_FIELDS = AnalyzeConcordanceField.values();
+
+	private String evalDataFile;
+    private List<String[]> data = new ArrayList<String[]>();
+
+    private static Logger logger = Logger.getLogger(AnalyzeConcordance.class);
+
+    protected int execute() {
+        int result;
+
+        try {
+            createOutputDirectory();
+
+            // initialize all the data from the csv file and allocate the list of covariates
+            logger.info("Reading in input csv file...");
+            initializeData();
+            logger.info("...Done!");
+
+            // output data tables for Rscript to read in
+            logger.info("Writing out intermediate tables for R...");
+            writeDataTables();
+            logger.info("...Done!");
+
+			// perform the analysis using Rscript and output the plots
+			logger.info("Calling analysis R scripts and writing out figures...");
+			result = callRScripts();
+			logger.info("...Done!");
+
+			// perform the analysis using Rscript and output the plots
+			logger.info("Generating html report...");
+			generateHtmlReport();
+			logger.info("...Done!");
+			
+        } catch (StingException se) {
+            throw se;
+        } catch (Exception e) {
+            throw new StingException("Error analyzing concordance", e);
+        }
+
+        return result;
+    }
+
+    private void createOutputDirectory() {
+        // create the output directory where all the data tables and plots will go
+        File outputDir = new File(this.outputDir);
+        if (!outputDir.exists() && !outputDir.mkdirs()) {
+            throw new StingException("Couldn't create directory: " + this.outputDir);
+        }
+    }
+
+    private void initializeData() throws FileNotFoundException {
+        // add the column headers to the data
+        addHeader();
+
+        // read the list of unfiltered eval files
+        addEvalListFile(EvalFilterType.UNFILTERED, new File(evalListFile));
+
+        // if provided, read the list of filtered eval files
+        if (filteredEvalListFile != null) {
+            addEvalListFile(EvalFilterType.FILTERED, new File(filteredEvalListFile));
+        }
+    }
+
+    private void addHeader() {
+        String[] headers = new String[ANALYZE_CONCORDANCE_FIELDS.length + 2];
+        int column = 0;
+        headers[column++] = "eval_id";
+        headers[column++] = "filter_type";
+
+        for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) {
+            headers[column++] = field.getColumnHeader();
+        }
+
+        data.add(headers);
+    }
+
+    private void addEvalListFile(EvalFilterType filterType, File evalListFile) throws FileNotFoundException {
+        for (String line : new xReadLines(evalListFile)) {
+            String[] parts = line.split("\t");
+            addEvalFile(parts[0], filterType, new File(parts[1]));
+        }
+    }
+
+    private void addEvalFile(String evalID, EvalFilterType filterType, File evalFile) throws FileNotFoundException {
+        SortedMap<AnalyzeConcordanceField, String> fieldValues = new TreeMap<AnalyzeConcordanceField, String>();
+
+        for (String line : new xReadLines(evalFile)) {
+            for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) {
+                String value = field.parseLine(line);
+                if (value != null) {
+                    fieldValues.put(field, value);
+                    break;  // continue to the next line.
+                }
+            }
+        }
+
+        String[] values = new String[ANALYZE_CONCORDANCE_FIELDS.length + 2];
+        int column = 0;
+        values[column++] = evalID;
+        values[column++] = filterType.toString().toLowerCase();
+
+        // get all the values, including null if for some reason a value wasn't found
+        for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) {
+            values[column++] = fieldValues.get(field);
+        }
+
+        data.add(values);
+    }
+
+    private void writeDataTables() throws FileNotFoundException {
+		evalDataFile = baseName + ".eval_data.tsv";
+        // Create a PrintStream
+        PrintStream output = new PrintStream(new File(outputDir, evalDataFile));
+        for (String[] line : data) {
+            output.println(Utils.join("\t", line));
+        }
+        output.close();
+    }
+
+    private int callRScripts() {
+        String command = pathToRscript + " "
+                + new File(pathToResources, "analyzeConcordance.R") + " "
+				+ new File(outputDir, baseName) + " "
+                + new File(outputDir, evalDataFile);
+
+        return ProcessUtils.runCommandAndWait(command);
+    }
+
+	private void generateHtmlReport() throws FileNotFoundException {
+		// TODO: Enhance the reports
+		PrintStream output = new PrintStream(new File(outputDir, "report.html"));
+		output.println("<html><body>");
+		for (File pngFile : new File(outputDir).listFiles(new PathUtils.ExtensionFilter("png"))) {
+			output.println("<div><img src=\"" + pngFile.getName() + "\"/></div>");
+		}
+		output.println("</body></html>");
+		output.close();
+	}
+
+    public static void main(String[] argv) {
+        try {
+            AnalyzeConcordance instance = new AnalyzeConcordance();
+            start(instance, argv);
+            System.exit(CommandLineProgram.result);
+        } catch (Exception e) {
+            exitSystemWithError(e);
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java
+++ b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java
@ -0,0 +1,41 @@
+package org.broadinstitute.sting.playground.analyzeconcordance;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: kshakir
+ * Date: Feb 11, 2010
+ */
+public enum AnalyzeConcordanceField {
+    N_BASES_COVERED("all_bases", "all,summary,variant_counts", "n bases covered"),
+    ALL_DBSNP_RATE("all_dbsnp", "all,summary,db_coverage", "dbsnp_rate"),
+    ALL_VARIANT_COUNT("all_variants", "all,summary,variant_counts", "variants"),
+    ALL_TITV_RATIO("all_titv", "all,summary,transitions_transversions", "ratio"),
+    KNOWN_VARIANT_COUNT("known_variants", "known,summary,variant_counts", "variants"),
+    KNOWN_TITV_RATIO("known_titv", "known,summary,transitions_transversions", "ratio"),
+    NOVEL_VARIANT_COUNT("novel_variants", "novel,summary,variant_counts", "variants"),
+    NOVEL_TITV_RATIO("novel_titv", "novel,summary,transitions_transversions", "ratio");
+
+    private String columnHeader;
+    private Pattern pattern;
+
+    private AnalyzeConcordanceField(String columnHeader, String evalHeader, String analysis) {
+        this.columnHeader = columnHeader;
+
+        String lineRegex = evalHeader + " {2,}" + analysis + " {2,}([0-9.]+).*";
+        this.pattern = Pattern.compile(lineRegex);
+    }
+
+    public String getColumnHeader() {
+        return this.columnHeader;
+    }
+
+    public String parseLine(String line) {
+        Matcher matcher = this.pattern.matcher(line);
+        if (!matcher.matches())
+            return null;
+        return matcher.group(1);
+    }
+}
--- a/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java
+++ b/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java
@ -0,0 +1,41 @@
+package org.broadinstitute.sting.playground.utils;
+
+import org.broadinstitute.sting.utils.xReadLines;
+import org.broadinstitute.sting.utils.StingException;
+import org.apache.log4j.Logger;
+
+/**
+ * A set of utilities for managing external processes.
+ */
+public class ProcessUtils {
+	private static Logger logger = Logger.getLogger(ProcessUtils.class);
+
+	/**
+	 * Runs a command line and returns the result code.
+	 * @param command Command line to execute.
+	 * @return Result code of the command.
+	 */
+	public static int runCommandAndWait(String command) {
+		try {
+			logger.debug("Running command: " + command);
+
+			Process p = Runtime.getRuntime().exec(command);
+			int result = p.waitFor();
+
+			if (logger.isDebugEnabled()) {
+				for (String line : new xReadLines(p.getInputStream())) {
+					logger.debug("command: " + line);
+				}
+				for (String line : new xReadLines(p.getErrorStream())) {
+					logger.error("command: " + line);
+				}
+			}
+
+			logger.debug("Command exited with result: " + result);
+
+			return result;
+		} catch (Exception e) {
+			throw new StingException("Error running command:" + command, e);
+		}
+	}
+}
--- a/packages/AnalyzeConcordance.xml
+++ b/packages/AnalyzeConcordance.xml
@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Currently built via:
+    ant playground package -Dexecutable=AnalyzeConcordance
+-->
+<package name="AnalyzeConcordance">
+  <executable name="AnalyzeConcordance">
+    <main-class name="org.broadinstitute.sting.playground.analyzeconcordance.AnalyzeConcordance" />
+    <resource-bundle file="StingText.properties" />
+    <dependencies>
+      <class name="org.broadinstitute.sting.playground.analyzeconcordance.AnalyzeConcordance" />
+    </dependencies>
+  </executable>
+  <resources>
+    <!-- Supplemental scripts for graph generation, etc. -->
+    <file name="R/analyzeConcordance/analyzeConcordance.R" />
+  </resources>
+</package>