Placeholder commit: improvements to VariantReport (now shows stats for variants that are called in one set and filtered in another). Better command-line argument support.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4404 348d0f76-0448-11de-a6fe-93d51630548a
2010-10-01 18:46:53 +00:00 · 2010-10-01 18:46:53 +00:00 · fe29c8b09c
parent d4398f2686
commit fe29c8b09c
2 changed files with 144 additions and 51 deletions
--- a/R/VariantReport/VariantReport.R
+++ b/R/VariantReport/VariantReport.R
@ -1,39 +1,72 @@
 source(paste(Sys.getenv("STING_DIR"), "/R/gsacommons.R", sep=""));

-if (interactive()) {
-    if (!exists("plotRoot")) {
-        plotRoot = "test.plot";
-    }
-} else {
-    args = commandArgs(TRUE);
+if (!interactive()) {
+    options(warn=-1);

-    evalRoot = args[1];
-    plotRoot = args[2];
+    library(getopt);
+
+    spec = c(
+        'evalRoot', 'e', 1, 'character', "root of the VariantEval files",
+        'plotRoot', 'p', 1, 'character', "output root of the PDF file"
+    );
+
+    opt = getopt(matrix(spec, byrow=T, nrow=length(spec)/5));
+} else {
+    opt = list(
+        evalRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval",
+        plotRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval/plot"
+    );
 }

-eval = read.eval(evalRoot);
+eval = read.eval(opt$evalRoot);

-plot.begin(plotRoot, "variantReport");
+plot.begin(opt$plotRoot, "variantReport");

-# Venn diagram
-plot.callsetConcordance(eval);
+if (length(eval$CallsetNames) > 0) {
+    # Venn diagram
+    plot.callsetConcordance(eval);

-# Venn by AC
-plot.callsetConcordanceByAC(eval, novelty_name="all");
-plot.callsetConcordanceByAC(eval, novelty_name="known");
-plot.callsetConcordanceByAC(eval, novelty_name="novel");
+    # Venn by AC
+    plot.callsetConcordanceByAC(eval, novelty_name="all");
+    plot.callsetConcordanceByAC(eval, novelty_name="known");
+    plot.callsetConcordanceByAC(eval, novelty_name="novel");

-# Allele count spectrum
-plot.alleleCountSpectrum(eval, novelty_name="all");
-plot.alleleCountSpectrum(eval, novelty_name="known");
-plot.alleleCountSpectrum(eval, novelty_name="novel");
+    # Allele count spectrum
+    plot.alleleCountSpectrum(eval, novelty_name="all");
+    plot.alleleCountSpectrum(eval, novelty_name="known");
+    plot.alleleCountSpectrum(eval, novelty_name="novel");

-# Ti/Tv spectrum
-plot.titvSpectrum(eval, novelty_name="all");
-plot.titvSpectrum(eval, novelty_name="known");
-plot.titvSpectrum(eval, novelty_name="novel");
+    # Ti/Tv spectrum
+    plot.titvSpectrum(eval, novelty_name="all");
+    plot.titvSpectrum(eval, novelty_name="known");
+    plot.titvSpectrum(eval, novelty_name="novel");

-# Per-sample
-#plot.variantsPerSample(eval);
+    # Per-sample
+    #plot.variantsPerSample(eval);
+} else {
+    plot.variantsPerSample(eval, novelty_name="all");
+    plot.variantsPerSample(eval, novelty_name="known");
+    plot.variantsPerSample(eval, novelty_name="novel");
+}

-plot.end(plotRoot);
+plot.variantsPerSample(eval, novelty_name="all");
+plot.variantsPerSample(eval, novelty_name="known");
+plot.variantsPerSample(eval, novelty_name="novel");
+
+allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci;
+knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci;
+novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci;
+
+allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio;
+knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio;
+novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio;
+
+suppressPackageStartupMessages(library(gplots));
+
+variantsummary = as.data.frame(t(matrix(c(allVariants, allTiTv, knownVariants, knownTiTv, novelVariants, novelTiTv), nrow=2)));
+colnames(variantsummary) = c("counts", "ti/tv");
+rownames(variantsummary) = c("all", "known", "novel");
+
+textplot(variantsummary);
+
+plot.end(opt$plotRoot);
--- a/R/gsacommons.R
+++ b/R/gsacommons.R
@ -56,7 +56,11 @@ plot.venn <- function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
    a = readPNG(filename);
    
    plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
-    rasterImage(a, pos[1], pos[2], pos[3], pos[4]);
+    if (a >= b) {
+        rasterImage(a, pos[1], pos[2], pos[3], pos[4]);
+    } else {
+        rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
+    }

    # Clean up!
    unlink(filename);
@ -83,6 +87,7 @@ read.eval <- function(evalRoot) {
    fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
    fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
    fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
+    fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
    fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
    fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
    fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
@ -98,6 +103,7 @@ read.eval <- function(evalRoot) {
        QualityScoreHistogram = NA,
        SampleStatistics = NA,
        SampleSummaryStatistics = NA,
+        SimpleMetricsBySample = NA,
        TiTv = NA,
        TiTvStats = NA,
        Variant_Quality_Score = NA,
@ -117,14 +123,18 @@ read.eval <- function(evalRoot) {
    eval$QualityScoreHistogram           = .attemptToLoadFile(fileQualityScoreHistogram);
    eval$SampleStatistics                = .attemptToLoadFile(fileSampleStatistics);
    eval$SampleSummaryStatistics         = .attemptToLoadFile(fileSampleSummaryStatistics);
+    eval$SimpleMetricsBySample           = .attemptToLoadFile(fileSimpleMetricsBySample);
    eval$TiTv                            = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
    eval$TiTvStats                       = .attemptToLoadFile(fileTiTvStats);
    eval$Variant_Quality_Score           = .attemptToLoadFile(fileVariant_Quality_Score);

    uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
-    eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("Filtered|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
+    eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
    eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
-    eval$CallsetFilteredNames = as.vector(c(paste(eval$CallsetNames[1], "-filteredIn", eval$CallsetNames[2], sep=""), paste(eval$CallsetNames[2], "-filteredIn", eval$CallsetNames[1], sep="")));
+    eval$CallsetFilteredNames = as.vector(c(
+        paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""),
+        paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep=""))
+    );

    eval;
 }
@ -142,8 +152,14 @@ eval.getMetrics <- function(eval, jexl_expression) {
    callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci;
    callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio;

-    callset.filteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci;
-    callset.filteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio;
+    callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci;
+    callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio;
+
+    callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci;
+    callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio;
+
+    callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci;
+    callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio;
    
    metrics = list(
        all = callset.calledCounts,
@ -155,48 +171,66 @@ eval.getMetrics <- function(eval, jexl_expression) {
        novel = callset.novelCounts,
        novel.titv = callset.novelCounts.titv,

-        filtered = callset.filteredCounts,
-        filtered.titv = callset.filteredCounts.titv
+        filtered.all = callset.allFilteredCounts,
+        filtered.all.titv = callset.allFilteredCounts.titv,
+
+        filtered.known = callset.knownFilteredCounts,
+        filtered.known.titv = callset.knownFilteredCounts.titv,
+
+        filtered.novel = callset.novelFilteredCounts,
+        filtered.novel.titv = callset.novelFilteredCounts.titv
    );
 }

-.plot.callsetConcordance.getLabelText <- function(name, metrics, union) {
-    text = sprintf("%s (%0.1f%% of union)\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f\n",
-                   name,             100.0*metrics$all/union$all,
+.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) {
+    if (is.na(filtered.metrics)) {
+        text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f",
+                   name,             100*metrics$all/union$all.withfiltered,
                   metrics$all,      metrics$all.titv,
                   metrics$known,    metrics$known.titv,
                   metrics$novel,    metrics$novel.titv
-    );
+        );
+    } else {
+        text = sprintf("%s (%0.01f%% of union)\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f\n\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f",
+                   name,             100*(metrics$all + filtered.metrics$all)/union$all.withfiltered,
+                   name,             othername,
+                   metrics$all,      metrics$all.titv,
+                   metrics$known,    metrics$known.titv,
+                   metrics$novel,    metrics$novel.titv,
+
+                   name, othername,
+                   filtered.metrics$all,      filtered.metrics$all.titv,
+                   filtered.metrics$known,    filtered.metrics$known.titv,
+                   filtered.metrics$novel,    filtered.metrics$novel.titv
+        );
+    }
 }

 plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) {
    aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]);
+    aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]);
    intersection = eval.getMetrics(eval, "Intersection");
    bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]);
+    bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]);

    union = list(
        all = intersection$all + aonly$all + bonly$all,
-        known = intersection$known + aonly$known + bonly$known,
-        novel = intersection$novel + aonly$novel + bonly$novel,
-        filtered = intersection$filtered + aonly$filtered + bonly$filtered
+        all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all
    );

-    #par.def = par(no.readonly = TRUE);
-    #par(mar=c(0.1, 0.1, 0.1, 0.1));
+    plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col);

-    plot.venn(aonly$all + intersection$all, bonly$all + intersection$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col);
-
-    text(0, 0.5, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], aonly, union));
-    text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", intersection, union));
-    text(1, 0.5, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], bonly, union));
-
-    #par(par.def);
+    text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, union));
+    text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union));
+    text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union));
 }

 plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#5C92A4", "#55BBFF")) {
    aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name);
+    aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]);
    intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name);
    bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name);
+    bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]);

    title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep="");

@ -304,7 +338,7 @@ plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A
    legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col);
 }

-plot.variantsPerSample <- function(eval) {
+plot.variantsPerSample2 <- function(eval) {
    if (!is.na(eval$MetricsBySample)) {
        metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; 
        metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; 
@ -323,3 +357,29 @@ plot.variantsPerSample <- function(eval) {
        axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4);
    }
 }
+
+plot.variantsPerSample <- function(eval, novelty_name="all") {
+    if (!is.na(eval$SimpleMetricsBySample)) {
+        metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),]; 
+
+        title = paste("Calls per sample (", novelty_name, ")", sep="");
+        indices = order(metrics$CountVariants, decreasing=TRUE);
+
+        par.def = par(no.readonly = TRUE);
+        par(mar=c(5, 4, 4, 4));
+
+        plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n");
+        points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black");
+
+        axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4);
+
+        par(new=TRUE);
+        plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n");
+        points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black");
+
+        titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio));
+        axis(4, at=titvaxis, labels=titvaxis, las=2);
+
+        par(par.def);
+    }
+}