From 3738b763200bfd81c9c93268e817daf0ba7b760c Mon Sep 17 00:00:00 2001 From: kshakir Date: Mon, 22 Feb 2010 20:28:52 +0000 Subject: [PATCH] Added a playground concordance analyzer for summarizing VariantEval across a group. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2867 348d0f76-0448-11de-a6fe-93d51630548a --- R/analyzeConcordance/analyzeConcordance.R | 61 ++++++ .../AnalyzeConcordance.java | 185 ++++++++++++++++++ .../AnalyzeConcordanceField.java | 41 ++++ .../sting/playground/utils/ProcessUtils.java | 41 ++++ packages/AnalyzeConcordance.xml | 18 ++ 5 files changed, 346 insertions(+) create mode 100755 R/analyzeConcordance/analyzeConcordance.R create mode 100644 java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java create mode 100644 java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java create mode 100644 java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java create mode 100644 packages/AnalyzeConcordance.xml diff --git a/R/analyzeConcordance/analyzeConcordance.R b/R/analyzeConcordance/analyzeConcordance.R new file mode 100755 index 000000000..584e0783c --- /dev/null +++ b/R/analyzeConcordance/analyzeConcordance.R @@ -0,0 +1,61 @@ +#!/usr/bin/env Rscript + +args <- commandArgs(TRUE) + +base_name = args[1] +input = args[2] + +d <- read.table(input, header=T) +# separate the data into filtered and unfiltered + +d.filtered <- d[d$filter_type=="filtered",] +d.unfiltered <- d[d$filter_type=="unfiltered",] + +if (nrow(d.filtered) > 0) { + d.display <- d.filtered +} else { + d.display <- d.unfiltered +} + +# +# Plot histograms of the known versus novel Ti/Tv +# + +outfile = paste(base_name, ".histograms.png", sep="") + +if (nrow(d.filtered) > 0) { + nFilterTypes <- 2 +} else { + nFilterTypes <- 1 +} + +png(outfile, width=600, height=(300 * nFilterTypes)) +par(cex=1.1, mfrow=c(1 * nFilterTypes,2)) +nbreaks <- 20 +color <- "grey" +xlim <- c(0,4) + +hist(d.unfiltered$known_titv, nbreaks, col=color, xlim=xlim) +hist(d.unfiltered$novel_titv, nbreaks, col=color, xlim=xlim) + +if (nrow(d.filtered) > 0) { + hist(d.filtered$known_titv, nbreaks, col=color, xlim=xlim) + hist(d.filtered$novel_titv, nbreaks, col=color, xlim=xlim) +} + +dev.off() + +# +# Plot samples in order of novel Ti/Tv versus known Ti/Tv +# + +outfile = paste(base_name, ".novel_vs_known_titv.png", sep="") + +png(outfile, width=600, height=600) + +d.display <- d.display[order(d.display$novel_titv),] +plot(1:length(d.display$known_titv),d.display$known_titv,type="b",col="blue",ylim=c(0,4), xlab="Sample #", ylab="Ti / Tv") +points(1:length(d.display$novel_titv),d.display$novel_titv,type="b",col="red",ylim=c(0,4)) +legend("bottomright", c("known","novel"), col=c("blue","red"), pch=21) + +dev.off() diff --git a/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java new file mode 100644 index 000000000..7692f174e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordance.java @@ -0,0 +1,185 @@ +package org.broadinstitute.sting.playground.analyzeconcordance; + +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.PathUtils; +import org.broadinstitute.sting.playground.utils.ProcessUtils; +import org.apache.log4j.Logger; + +import java.io.*; +import java.util.*; + +/** + * Compares results of VariantEval across a population or a case/control group. + */ +public class AnalyzeConcordance extends CommandLineProgram { + + @Argument(fullName = "group_name", shortName = "groupName", doc = "The name of the group which will be prefixed output files", required = false) + private String baseName = "analyze_concordance"; + @Argument(fullName = "eval_list", shortName = "evalList", doc = "The input list of unfiltered eval files to analyze", required = true) + private String evalListFile = null; + @Argument(fullName = "filtered_eval_list", shortName = "filteredEvalList", doc = "The input list of filtered eval files to analyze", required = false) + private String filteredEvalListFile = null; + @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) + private String outputDir = "analyzeConcordance"; + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript", required = false) + private String pathToRscript = "env Rscript"; + @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting analyze concordance R scripts", required = false) + private String pathToResources = "R" + File.separator + "analyzeConcordance"; + + private enum EvalFilterType { + UNFILTERED, FILTERED + } + + private static final AnalyzeConcordanceField[] ANALYZE_CONCORDANCE_FIELDS = AnalyzeConcordanceField.values(); + + private String evalDataFile; + private List data = new ArrayList(); + + private static Logger logger = Logger.getLogger(AnalyzeConcordance.class); + + protected int execute() { + int result; + + try { + createOutputDirectory(); + + // initialize all the data from the csv file and allocate the list of covariates + logger.info("Reading in input csv file..."); + initializeData(); + logger.info("...Done!"); + + // output data tables for Rscript to read in + logger.info("Writing out intermediate tables for R..."); + writeDataTables(); + logger.info("...Done!"); + + // perform the analysis using Rscript and output the plots + logger.info("Calling analysis R scripts and writing out figures..."); + result = callRScripts(); + logger.info("...Done!"); + + // perform the analysis using Rscript and output the plots + logger.info("Generating html report..."); + generateHtmlReport(); + logger.info("...Done!"); + + } catch (StingException se) { + throw se; + } catch (Exception e) { + throw new StingException("Error analyzing concordance", e); + } + + return result; + } + + private void createOutputDirectory() { + // create the output directory where all the data tables and plots will go + File outputDir = new File(this.outputDir); + if (!outputDir.exists() && !outputDir.mkdirs()) { + throw new StingException("Couldn't create directory: " + this.outputDir); + } + } + + private void initializeData() throws FileNotFoundException { + // add the column headers to the data + addHeader(); + + // read the list of unfiltered eval files + addEvalListFile(EvalFilterType.UNFILTERED, new File(evalListFile)); + + // if provided, read the list of filtered eval files + if (filteredEvalListFile != null) { + addEvalListFile(EvalFilterType.FILTERED, new File(filteredEvalListFile)); + } + } + + private void addHeader() { + String[] headers = new String[ANALYZE_CONCORDANCE_FIELDS.length + 2]; + int column = 0; + headers[column++] = "eval_id"; + headers[column++] = "filter_type"; + + for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) { + headers[column++] = field.getColumnHeader(); + } + + data.add(headers); + } + + private void addEvalListFile(EvalFilterType filterType, File evalListFile) throws FileNotFoundException { + for (String line : new xReadLines(evalListFile)) { + String[] parts = line.split("\t"); + addEvalFile(parts[0], filterType, new File(parts[1])); + } + } + + private void addEvalFile(String evalID, EvalFilterType filterType, File evalFile) throws FileNotFoundException { + SortedMap fieldValues = new TreeMap(); + + for (String line : new xReadLines(evalFile)) { + for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) { + String value = field.parseLine(line); + if (value != null) { + fieldValues.put(field, value); + break; // continue to the next line. + } + } + } + + String[] values = new String[ANALYZE_CONCORDANCE_FIELDS.length + 2]; + int column = 0; + values[column++] = evalID; + values[column++] = filterType.toString().toLowerCase(); + + // get all the values, including null if for some reason a value wasn't found + for (AnalyzeConcordanceField field : ANALYZE_CONCORDANCE_FIELDS) { + values[column++] = fieldValues.get(field); + } + + data.add(values); + } + + private void writeDataTables() throws FileNotFoundException { + evalDataFile = baseName + ".eval_data.tsv"; + // Create a PrintStream + PrintStream output = new PrintStream(new File(outputDir, evalDataFile)); + for (String[] line : data) { + output.println(Utils.join("\t", line)); + } + output.close(); + } + + private int callRScripts() { + String command = pathToRscript + " " + + new File(pathToResources, "analyzeConcordance.R") + " " + + new File(outputDir, baseName) + " " + + new File(outputDir, evalDataFile); + + return ProcessUtils.runCommandAndWait(command); + } + + private void generateHtmlReport() throws FileNotFoundException { + // TODO: Enhance the reports + PrintStream output = new PrintStream(new File(outputDir, "report.html")); + output.println(""); + for (File pngFile : new File(outputDir).listFiles(new PathUtils.ExtensionFilter("png"))) { + output.println("
"); + } + output.println(""); + output.close(); + } + + public static void main(String[] argv) { + try { + AnalyzeConcordance instance = new AnalyzeConcordance(); + start(instance, argv); + System.exit(CommandLineProgram.result); + } catch (Exception e) { + exitSystemWithError(e); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java new file mode 100644 index 000000000..b7ff2e50f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/analyzeconcordance/AnalyzeConcordanceField.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.playground.analyzeconcordance; + +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Created by IntelliJ IDEA. + * User: kshakir + * Date: Feb 11, 2010 + */ +public enum AnalyzeConcordanceField { + N_BASES_COVERED("all_bases", "all,summary,variant_counts", "n bases covered"), + ALL_DBSNP_RATE("all_dbsnp", "all,summary,db_coverage", "dbsnp_rate"), + ALL_VARIANT_COUNT("all_variants", "all,summary,variant_counts", "variants"), + ALL_TITV_RATIO("all_titv", "all,summary,transitions_transversions", "ratio"), + KNOWN_VARIANT_COUNT("known_variants", "known,summary,variant_counts", "variants"), + KNOWN_TITV_RATIO("known_titv", "known,summary,transitions_transversions", "ratio"), + NOVEL_VARIANT_COUNT("novel_variants", "novel,summary,variant_counts", "variants"), + NOVEL_TITV_RATIO("novel_titv", "novel,summary,transitions_transversions", "ratio"); + + private String columnHeader; + private Pattern pattern; + + private AnalyzeConcordanceField(String columnHeader, String evalHeader, String analysis) { + this.columnHeader = columnHeader; + + String lineRegex = evalHeader + " {2,}" + analysis + " {2,}([0-9.]+).*"; + this.pattern = Pattern.compile(lineRegex); + } + + public String getColumnHeader() { + return this.columnHeader; + } + + public String parseLine(String line) { + Matcher matcher = this.pattern.matcher(line); + if (!matcher.matches()) + return null; + return matcher.group(1); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java b/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java new file mode 100644 index 000000000..bb2e19c19 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/utils/ProcessUtils.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.playground.utils; + +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.StingException; +import org.apache.log4j.Logger; + +/** + * A set of utilities for managing external processes. + */ +public class ProcessUtils { + private static Logger logger = Logger.getLogger(ProcessUtils.class); + + /** + * Runs a command line and returns the result code. + * @param command Command line to execute. + * @return Result code of the command. + */ + public static int runCommandAndWait(String command) { + try { + logger.debug("Running command: " + command); + + Process p = Runtime.getRuntime().exec(command); + int result = p.waitFor(); + + if (logger.isDebugEnabled()) { + for (String line : new xReadLines(p.getInputStream())) { + logger.debug("command: " + line); + } + for (String line : new xReadLines(p.getErrorStream())) { + logger.error("command: " + line); + } + } + + logger.debug("Command exited with result: " + result); + + return result; + } catch (Exception e) { + throw new StingException("Error running command:" + command, e); + } + } +} diff --git a/packages/AnalyzeConcordance.xml b/packages/AnalyzeConcordance.xml new file mode 100644 index 000000000..d0878089b --- /dev/null +++ b/packages/AnalyzeConcordance.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + +