diff --git a/R/VariantReport/VariantReport.R b/R/VariantReport/VariantReport.R index 1fefb256c..2d2295636 100644 --- a/R/VariantReport/VariantReport.R +++ b/R/VariantReport/VariantReport.R @@ -1,39 +1,72 @@ source(paste(Sys.getenv("STING_DIR"), "/R/gsacommons.R", sep="")); -if (interactive()) { - if (!exists("plotRoot")) { - plotRoot = "test.plot"; - } -} else { - args = commandArgs(TRUE); +if (!interactive()) { + options(warn=-1); - evalRoot = args[1]; - plotRoot = args[2]; + library(getopt); + + spec = c( + 'evalRoot', 'e', 1, 'character', "root of the VariantEval files", + 'plotRoot', 'p', 1, 'character', "output root of the PDF file" + ); + + opt = getopt(matrix(spec, byrow=T, nrow=length(spec)/5)); +} else { + opt = list( + evalRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval", + plotRoot = "results/v9/with_1kg/recalibrated.with1KGSites.vcf.intermediate/merged.vcf.eval/plot" + ); } -eval = read.eval(evalRoot); +eval = read.eval(opt$evalRoot); -plot.begin(plotRoot, "variantReport"); +plot.begin(opt$plotRoot, "variantReport"); -# Venn diagram -plot.callsetConcordance(eval); +if (length(eval$CallsetNames) > 0) { + # Venn diagram + plot.callsetConcordance(eval); -# Venn by AC -plot.callsetConcordanceByAC(eval, novelty_name="all"); -plot.callsetConcordanceByAC(eval, novelty_name="known"); -plot.callsetConcordanceByAC(eval, novelty_name="novel"); + # Venn by AC + plot.callsetConcordanceByAC(eval, novelty_name="all"); + plot.callsetConcordanceByAC(eval, novelty_name="known"); + plot.callsetConcordanceByAC(eval, novelty_name="novel"); -# Allele count spectrum -plot.alleleCountSpectrum(eval, novelty_name="all"); -plot.alleleCountSpectrum(eval, novelty_name="known"); -plot.alleleCountSpectrum(eval, novelty_name="novel"); + # Allele count spectrum + plot.alleleCountSpectrum(eval, novelty_name="all"); + plot.alleleCountSpectrum(eval, novelty_name="known"); + plot.alleleCountSpectrum(eval, novelty_name="novel"); -# Ti/Tv spectrum -plot.titvSpectrum(eval, novelty_name="all"); -plot.titvSpectrum(eval, novelty_name="known"); -plot.titvSpectrum(eval, novelty_name="novel"); + # Ti/Tv spectrum + plot.titvSpectrum(eval, novelty_name="all"); + plot.titvSpectrum(eval, novelty_name="known"); + plot.titvSpectrum(eval, novelty_name="novel"); -# Per-sample -#plot.variantsPerSample(eval); + # Per-sample + #plot.variantsPerSample(eval); +} else { + plot.variantsPerSample(eval, novelty_name="all"); + plot.variantsPerSample(eval, novelty_name="known"); + plot.variantsPerSample(eval, novelty_name="novel"); +} -plot.end(plotRoot); +plot.variantsPerSample(eval, novelty_name="all"); +plot.variantsPerSample(eval, novelty_name="known"); +plot.variantsPerSample(eval, novelty_name="novel"); + +allVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "all"),]$nVariantLoci; +knownVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "known"),]$nVariantLoci; +novelVariants = eval$CountVariants[which(eval$CountVariants$jexl_expression == "none" & eval$CountVariants$filter_name == "called" & eval$CountVariants$novelty_name == "novel"),]$nVariantLoci; + +allTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "all"),]$ti.tv_ratio; +knownTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "known"),]$ti.tv_ratio; +novelTiTv = eval$TiTv[which(eval$TiTv$jexl_expression == "none" & eval$TiTv$filter_name == "called" & eval$TiTv$novelty_name == "novel"),]$ti.tv_ratio; + +suppressPackageStartupMessages(library(gplots)); + +variantsummary = as.data.frame(t(matrix(c(allVariants, allTiTv, knownVariants, knownTiTv, novelVariants, novelTiTv), nrow=2))); +colnames(variantsummary) = c("counts", "ti/tv"); +rownames(variantsummary) = c("all", "known", "novel"); + +textplot(variantsummary); + +plot.end(opt$plotRoot); diff --git a/R/gsacommons.R b/R/gsacommons.R index 14d99bd5f..1b00651a9 100644 --- a/R/gsacommons.R +++ b/R/gsacommons.R @@ -56,7 +56,11 @@ plot.venn <- function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, a = readPNG(filename); plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab=""); - rasterImage(a, pos[1], pos[2], pos[3], pos[4]); + if (a >= b) { + rasterImage(a, pos[1], pos[2], pos[3], pos[4]); + } else { + rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180); + } # Clean up! unlink(filename); @@ -83,6 +87,7 @@ read.eval <- function(evalRoot) { fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); + fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); @@ -98,6 +103,7 @@ read.eval <- function(evalRoot) { QualityScoreHistogram = NA, SampleStatistics = NA, SampleSummaryStatistics = NA, + SimpleMetricsBySample = NA, TiTv = NA, TiTvStats = NA, Variant_Quality_Score = NA, @@ -117,14 +123,18 @@ read.eval <- function(evalRoot) { eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); + eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); - eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("Filtered|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); + eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); - eval$CallsetFilteredNames = as.vector(c(paste(eval$CallsetNames[1], "-filteredIn", eval$CallsetNames[2], sep=""), paste(eval$CallsetNames[2], "-filteredIn", eval$CallsetNames[1], sep=""))); + eval$CallsetFilteredNames = as.vector(c( + paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""), + paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep="")) + ); eval; } @@ -142,8 +152,14 @@ eval.getMetrics <- function(eval, jexl_expression) { callset.novelCounts = callset.counts[which(callset.counts$filter_name == "called" & callset.counts$novelty_name == "novel"),]$nVariantLoci; callset.novelCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "called" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; - callset.filteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci; - callset.filteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; + callset.allFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "all"),]$nVariantLoci; + callset.allFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "all"),]$ti.tv_ratio; + + callset.knownFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "known"),]$nVariantLoci; + callset.knownFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "known"),]$ti.tv_ratio; + + callset.novelFilteredCounts = callset.counts[which(callset.counts$filter_name == "filtered" & callset.counts$novelty_name == "novel"),]$nVariantLoci; + callset.novelFilteredCounts.titv = callset.counts.titv[which(callset.counts.titv$filter_name == "filtered" & callset.counts.titv$novelty_name == "novel"),]$ti.tv_ratio; metrics = list( all = callset.calledCounts, @@ -155,48 +171,66 @@ eval.getMetrics <- function(eval, jexl_expression) { novel = callset.novelCounts, novel.titv = callset.novelCounts.titv, - filtered = callset.filteredCounts, - filtered.titv = callset.filteredCounts.titv + filtered.all = callset.allFilteredCounts, + filtered.all.titv = callset.allFilteredCounts.titv, + + filtered.known = callset.knownFilteredCounts, + filtered.known.titv = callset.knownFilteredCounts.titv, + + filtered.novel = callset.novelFilteredCounts, + filtered.novel.titv = callset.novelFilteredCounts.titv ); } -.plot.callsetConcordance.getLabelText <- function(name, metrics, union) { - text = sprintf("%s (%0.1f%% of union)\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f\n", - name, 100.0*metrics$all/union$all, +.plot.callsetConcordance.getLabelText <- function(name, othername, metrics, filtered.metrics=NA, union) { + if (is.na(filtered.metrics)) { + text = sprintf("%s (%0.01f%% of union)\nCalled:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f", + name, 100*metrics$all/union$all.withfiltered, metrics$all, metrics$all.titv, metrics$known, metrics$known.titv, metrics$novel, metrics$novel.titv - ); + ); + } else { + text = sprintf("%s (%0.01f%% of union)\nCalled in %s, absent in %s:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f\n\nCalled in %s, filtered in %s:\nAll: %d, Ti/Tv: %0.1f\nKnown: %d, Ti/Tv: %0.1f\nNovel: %d, Ti/Tv: %0.1f", + name, 100*(metrics$all + filtered.metrics$all)/union$all.withfiltered, + name, othername, + metrics$all, metrics$all.titv, + metrics$known, metrics$known.titv, + metrics$novel, metrics$novel.titv, + + name, othername, + filtered.metrics$all, filtered.metrics$all.titv, + filtered.metrics$known, filtered.metrics$known.titv, + filtered.metrics$novel, filtered.metrics$novel.titv + ); + } } plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63")) { aonly = eval.getMetrics(eval, eval$CallsetOnlyNames[1]); + aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]); intersection = eval.getMetrics(eval, "Intersection"); bonly = eval.getMetrics(eval, eval$CallsetOnlyNames[2]); + bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]); union = list( all = intersection$all + aonly$all + bonly$all, - known = intersection$known + aonly$known + bonly$known, - novel = intersection$novel + aonly$novel + bonly$novel, - filtered = intersection$filtered + aonly$filtered + bonly$filtered + all.withfiltered = intersection$all + aonly$all + bonly$all + aonly.filtered$all + bonly.filtered$all ); - #par.def = par(no.readonly = TRUE); - #par(mar=c(0.1, 0.1, 0.1, 0.1)); + plot.venn(aonly$all + intersection$all + aonly.filtered$all, bonly$all + intersection$all + bonly.filtered$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col); - plot.venn(aonly$all + intersection$all, bonly$all + intersection$all, 0, intersection$all, 0, 0, pos=c(0.32, 0.32, 0.68, 0.70), col=col); - - text(0, 0.5, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], aonly, union)); - text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", intersection, union)); - text(1, 0.5, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], bonly, union)); - - #par(par.def); + text(0, 0.45, cex=1.2, pos=4, .plot.callsetConcordance.getLabelText(eval$CallsetNames[1], eval$CallsetNames[2], aonly, aonly.filtered, union)); + text(0.5, 0.75, cex=1.2, adj=c(0.5, 0.33), .plot.callsetConcordance.getLabelText("Intersection", NA, intersection, NA, union)); + text(1, 0.45, cex=1.2, pos=2, .plot.callsetConcordance.getLabelText(eval$CallsetNames[2], eval$CallsetNames[1], bonly, bonly.filtered, union)); } plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#5C92A4", "#55BBFF")) { aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + aonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[1]); intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); + bonly.filtered = eval.getMetrics(eval, eval$CallsetFilteredNames[2]); title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep=""); @@ -304,7 +338,7 @@ plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A legend("topleft", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); } -plot.variantsPerSample <- function(eval) { +plot.variantsPerSample2 <- function(eval) { if (!is.na(eval$MetricsBySample)) { metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; @@ -323,3 +357,29 @@ plot.variantsPerSample <- function(eval) { axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4); } } + +plot.variantsPerSample <- function(eval, novelty_name="all") { + if (!is.na(eval$SimpleMetricsBySample)) { + metrics = eval$SimpleMetricsBySample[which(eval$SimpleMetricsBySample$evaluation_name == "eval" & eval$SimpleMetricsBySample$comparison_name == "dbsnp" & as.character(eval$SimpleMetricsBySample$jexl_expression) == "none" & eval$SimpleMetricsBySample$filter_name == "called" & eval$SimpleMetricsBySample$novelty_name == novelty_name),]; + + title = paste("Calls per sample (", novelty_name, ")", sep=""); + indices = order(metrics$CountVariants, decreasing=TRUE); + + par.def = par(no.readonly = TRUE); + par(mar=c(5, 4, 4, 4)); + + plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$row)), ylim=c(0, max(metrics$CountVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); + points(c(1:length(metrics$row)), (metrics$CountVariants)[indices], pch=21, col="black"); + + axis(1, at=c(1:length(metrics$row)), labels=(metrics$row)[indices], las=2, cex.axis=0.4); + + par(new=TRUE); + plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=c(1, length(metrics$row)), ylim=c(min(metrics$TiTvRatio), 1.2*max(metrics$TiTvRatio)), xlab="", ylab="", main=title, bty="n"); + points(c(1:length(metrics$row)), (metrics$TiTvRatio)[indices], pch=19, col="black"); + + titvaxis = c(min(metrics$TiTvRatio), max(metrics$TiTvRatio)); + axis(4, at=titvaxis, labels=titvaxis, las=2); + + par(par.def); + } +}