From a7409df1a66ad138a64deb4f5ba74f26e2c44782 Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 11 Aug 2010 02:22:50 +0000 Subject: [PATCH] Be more robust to missing or empty files in VariantEval output. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4008 348d0f76-0448-11de-a6fe-93d51630548a --- R/gsacommons.R | 150 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 113 insertions(+), 37 deletions(-) diff --git a/R/gsacommons.R b/R/gsacommons.R index 30fd2fc46..3617a052e 100644 --- a/R/gsacommons.R +++ b/R/gsacommons.R @@ -62,6 +62,16 @@ plot.venn <- function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, unlink(filename); } +.attemptToLoadFile <- function(filename) { + file = NA; + + if (file.exists(filename) & file.info(filename)$size > 500) { + file = read.csv(filename, header=TRUE, comment.char="#"); + } + + file; +} + read.eval <- function(evalRoot) { fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); @@ -78,25 +88,39 @@ read.eval <- function(evalRoot) { fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); eval = list( - AlleleCountStats = read.csv(fileAlleleCountStats, header=TRUE, comment.char="#"), - CompOverlap = read.csv(fileCompOverlap, header=TRUE, comment.char="#"), - CountVariants = read.csv(fileCountVariants, header=TRUE, comment.char="#"), - GenotypeConcordance = read.csv(fileGenotypeConcordance, header=TRUE, comment.char="#"), - MetricsByAc = read.csv(fileMetricsByAc, header=TRUE, comment.char="#"), - MetricsBySample = read.csv(fileMetricsBySample, header=TRUE, comment.char="#"), - Quality_Metrics_by_allele_count = read.csv(fileQuality_Metrics_by_allele_count, header=TRUE, comment.char="#"), - QualityScoreHistogram = read.csv(fileQualityScoreHistogram, header=TRUE, comment.char="#"), - SampleStatistics = read.csv(fileSampleStatistics, header=TRUE, comment.char="#"), - SampleSummaryStatistics = read.csv(fileSampleSummaryStatistics, header=TRUE, comment.char="#"), - TiTv = read.csv(fileTi_slash_Tv_Variant_Evaluator, header=TRUE, comment.char="#"), - TiTvStats = read.csv(fileTiTvStats, header=TRUE, comment.char="#"), - Variant_Quality_Score = read.csv(fileVariant_Quality_Score, header=TRUE, comment.char="#"), + AlleleCountStats = NA, + CompOverlap = NA, + CountVariants = NA, + GenotypeConcordance = NA, + MetricsByAc = NA, + MetricsBySample = NA, + Quality_Metrics_by_allele_count = NA, + QualityScoreHistogram = NA, + SampleStatistics = NA, + SampleSummaryStatistics = NA, + TiTv = NA, + TiTvStats = NA, + Variant_Quality_Score = NA, CallsetNames = c(), CallsetOnlyNames = c(), CallsetFilteredNames = c() ); + eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); + eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); + eval$CountVariants = .attemptToLoadFile(fileCountVariants); + eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); + eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); + eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample); + eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); + eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); + eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); + eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); + eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); + eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); + eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); + uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("Filtered|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); @@ -170,34 +194,62 @@ plot.callsetConcordance <- function(eval, col=c("#FF6342", "#63C6DE", "#ADDE63") } plot.callsetConcordanceByAC <- function(eval, normalize=TRUE, novelty_name="all", col=c("#FF6342", "#5C92A4", "#55BBFF")) { - aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); title = paste("Callset concordance per allele count (", novelty_name, " variants)", sep=""); + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + } + if (normalize == TRUE) { norm = aonly$n + intersection$n + bonly$n; matnorm = rbind(aonly$n/norm, intersection$n/norm, bonly$n/norm); - barplot(matnorm, col=col, xlab="Allele count", ylab="Fraction", main=title, names.arg=intersection$AC, ylim=c(0, 1.3), border=NA); + barplot(matnorm, col=col, xlab="Allele count", ylab="Fraction", main=title, names.arg=intersection$AC, xlim=c(0, max(intersection$AC)), ylim=c(0, 1.3), border=NA); } else { mat = rbind(aonly$n, intersection$n, bonly$n); - barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, border="gray"); + barplot(mat, col=col, xlab="Allele count", ylab="counts", main=title, names.arg=intersection$AC, xlim=c(0, max(intersection$AC)), ylim=c(0, 1), border=NA); } legend("topright", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); } plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A4", "#55BBFF")) { - aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); title = paste("Allele count spectrum (", novelty_name, " variants)", sep=""); - suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$n)), ylim=c(1, max(aonly$n + intersection$n, bonly$n + intersection$n)), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n")); + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + } + + suppressWarnings(plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(1, max(aonly$n + intersection$n, bonly$n + intersection$n)), xlab="Allele count", ylab="Number of variants", main=title, log="xy", bty="n")); suppressWarnings(points(intersection$AC, aonly$n + intersection$n, type="l", lwd=2, col=col[1])); suppressWarnings(points(intersection$AC, intersection$n, type="l", lwd=2, col=col[2])); suppressWarnings(points(intersection$AC, bonly$n + intersection$n, type="l", lwd=2, col=col[3])); @@ -205,13 +257,35 @@ plot.alleleCountSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", legend("topright", c(eval$CallsetOnlyNames[1], "Intersection", eval$CallsetOnlyNames[2]), fill=col); } +eval.getMetricsByAc <- function(eval, jexl_expression, novelty_name="all") { + piece = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(jexl_expression) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; +} + plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A4", "#55BBFF")) { - aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; - bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #aonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[1]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #intersection = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & eval$MetricsByAc$jexl_expression == "Intersection" & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + #bonly = eval$MetricsByAc[which(eval$MetricsByAc$evaluation_name == "eval" & eval$MetricsByAc$comparison_name == "dbsnp" & as.character(eval$MetricsByAc$jexl_expression) == as.character(eval$CallsetOnlyNames[2]) & eval$MetricsByAc$filter_name == "called" & eval$MetricsByAc$novelty_name == novelty_name),]; + + aonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[1], novelty_name); + intersection = eval.getMetricsByAc(eval, "Intersection", novelty_name); + bonly = eval.getMetricsByAc(eval, eval$CallsetOnlyNames[2], novelty_name); title = paste("Ti/Tv spectrum (", novelty_name, " variants)", sep=""); + if (length(intersection$AC) > 0 && length(aonly$AC) == 0) { + aonly = intersection; + aonly$n = 0; + aonly$nTi = 0; + aonly$nTv = 0; + } + + if (length(intersection$AC) > 0 && length(bonly$AC) == 0) { + bonly = intersection; + bonly$n = 0; + bonly$nTi = 0; + bonly$nTv = 0; + } + titv.aonly = (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv); titv.aonly.finite = titv.aonly[which(is.finite(titv.aonly))]; @@ -223,7 +297,7 @@ plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A titv.min = min(titv.aonly.finite, titv.intersection.finite, titv.bonly.finite); titv.max = max(titv.aonly.finite, titv.intersection.finite, titv.bonly.finite); - plot(0, 0, type="n", xlim=c(1, length(intersection$n)), ylim=c(titv.min, titv.max), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n"); + plot(0, 0, type="n", xlim=c(1, length(intersection$AC)), ylim=c(titv.min, titv.max), xlab="Allele count", ylab="Transition/transversion (Ti/Tv) ratio", main=title, bty="n"); points(intersection$AC, (aonly$nTi + intersection$nTi)/(aonly$nTv + intersection$nTv), type="l", lwd=2, col=col[1]); points(intersection$AC, intersection$Ti.Tv, type="l", lwd=2, col=col[2]); points(intersection$AC, (bonly$nTi + intersection$nTi)/(bonly$nTv + intersection$nTv), type="l", lwd=2, col=col[3]); @@ -238,19 +312,21 @@ plot.titvSpectrum <- function(eval, novelty_name="all", col=c("#FF6342", "#5C92A } plot.variantsPerSample <- function(eval) { - metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; - metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; - metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),]; + if (!is.na(eval$MetricsBySample)) { + metrics.all = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "all"),]; + metrics.known = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "known"),]; + metrics.novel = eval$MetricsBySample[which(eval$MetricsBySample$evaluation_name == "eval" & eval$MetricsBySample$comparison_name == "dbsnp" & as.character(eval$MetricsBySample$jexl_expression) == "none" & eval$MetricsBySample$filter_name == "called" & eval$MetricsBySample$novelty_name == "novel"),]; - title = "Calls per sample"; - indices = order(metrics$nVariants, decreasing=TRUE); + title = "Calls per sample"; + indices = order(metrics.all$nVariants, decreasing=TRUE); - plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics$sample)), ylim=c(0, max(metrics$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); - points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black"); - points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue"); - points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red"); + plot(0, 0, type="n", xaxt="n", xlim=c(1, length(metrics.all$sample)), ylim=c(0, max(metrics.all$nVariants)), xlab="", ylab="Number of variants", main=title, bty="n"); + points(c(1:length(metrics.all$sample)), (metrics.all$nVariants)[indices], pch=21, col="black"); + points(c(1:length(metrics.known$sample)), (metrics.known$nVariants)[indices], pch=21, col="blue"); + points(c(1:length(metrics.novel$sample)), (metrics.novel$nVariants)[indices], pch=21, col="red"); - legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red")); + legend("topright", c("All", "Known", "Novel"), pch=21, col=c("black", "blue", "red")); - axis(1, at=c(1:length(metrics$sample)), labels=(metrics$sample)[indices], las=2, cex.axis=0.4); + axis(1, at=c(1:length(metrics.all$sample)), labels=(metrics.all$sample)[indices], las=2, cex.axis=0.4); + } }