diff --git a/R/exomePostQC.R b/R/exomePostQC.R deleted file mode 100644 index 7fb27cad8..000000000 --- a/R/exomePostQC.R +++ /dev/null @@ -1,271 +0,0 @@ -library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/") -require("ggplot2") -require("gplots") - -args = commandArgs(TRUE) -onCMDLine = ! is.na(args[1]) -LOAD_DATA = F - -parseHighlightSamples <- function(s) { - return(unlist(strsplit(s, ",", fixed=T))) -} - -if ( onCMDLine ) { - ProjectName = args[1] - VariantEvalRoot = args[2] - preQCFile = args[3] - outputPDF = args[4] - if ( ! is.na(args[5]) ) - highlightSamples = parseHighlightSamples(args[5]) - else - highlightSamples = c() -} else { - ProjectName = "InDevelopmentInR" - preQCFile <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv" - VariantEvalRoot <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated" - outputPDF = "bar.pdf" - highlightSamples = c() # parseHighlightSamples("29029,47243") -} - -expandVEReport <- function(d) { - d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2) - d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2) - d$CountVariants$nIndels = d$CountVariants$nInsertions + d$CountVariants$nDeletions - return(d) -} - -createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) { - # Metrics by sites: - # bySite -> counts of SNPs and Indels by novelty, with expectations - # byAF -> snps and indels (known / novel) - r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))), - byAF = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAF.eval", sep=""))) - r$byAF$CountVariants$nIndels = r$byAF$CountVariants$nInsertions + r$byAF$CountVariants$nDeletions - - nChrom = 1 / min(r$byAF$TiTvVariantEvaluator$AlleleFrequency[r$byAF$TiTvVariantEvaluator$AlleleFrequency > 0]) - r$byAF$TiTvVariantEvaluator$nSNPs = r$byAF$TiTvVariantEvaluator$nTi + r$byAF$TiTvVariantEvaluator$nTv - r$byAF$CountVariants$AC = r$byAF$CountVariants$AlleleFrequency * nChrom - r$byAF$TiTvVariantEvaluator$AC = r$byAF$TiTvVariantEvaluator$AlleleFrequency * nChrom - return(r) -} - -summaryTable <- function(metricsBySites, metricsBySample) { - # SNP summary statistics - merged = merge(metricsBySites$bySite$CountVariants, metricsBySites$bySite$TiTvVariantEvaluator) - sub <- subset(merged, FunctionalClass=="all") - raw = melt(sub, id.vars=c("Novelty"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio")) - table = cast(raw, Novelty ~ ...) - # doesn't work with textplot - colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio") - return(table) -} - -summaryPlots <- function(metricsBySites) { - name = "SNP and Indel count by novelty and allele frequency" - molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels"))) - p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable) - p <- p + opts(title = name) - p <- p + scale_y_log10("Number of variants") - p <- p + geom_area() - p <- p + facet_grid(variable ~ ., scales="free") - p <- p + scale_x_continuous("Allele count (AC)") - print(p) - p <- p + scale_x_log10("Allele count (AC)") - print(p) - - # Counts vs. Allele frequency - name = "Variant counts by allele count" - for ( measure in c("nSNPs", "nIndels")) { - molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure)) - p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable) - p <- p + opts(title = paste(name, ":", measure)) - p <- p + scale_y_log10("Number of variants") - p <- p + scale_x_log10("Allele count (AC)") - p <- p + geom_point(alpha=0.5, size=4) - p <- p + geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x) - p <- p + facet_grid(Novelty ~ ., scales="free") - print(p) - } - - name = "Transition / transversion ratio by allele count" - byAFNoAll = subset(metricsBySites$byAF$TiTvVariantEvaluator, Novelty != "all" & FunctionalClass == "all" & AC > 0) - p <- ggplot(data=byAFNoAll, aes(x=AC, y=tiTvRatio, color=Novelty)) - p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4)) - p <- p + opts(title = name) - p <- p + geom_smooth(size=2) - p <- p + geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5) - p <- p + scale_x_continuous("Allele count (AC)") - print(p) - p <- p + scale_x_log10("Allele count (AC)") - print(p) - - # SNPs to indels ratio by allele frequency - name = "SNPs to indels ratio by allele frequency" - metricsBySites$byAF$CountVariants$SNP.Indel.Ratio = metricsBySites$byAF$CountVariants$nSNPs / metricsBySites$byAF$CountVariants$nIndels - metricsBySites$byAF$CountVariants$SNP.Indel.Ratio[metricsBySites$byAF$CountVariants$nIndels == 0] = NaN - p <- ggplot(data=subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty == "all"), aes(x=AC, y=SNP.Indel.Ratio)) - p <- p + opts(title = name) - p <- p + scale_y_continuous("SNP to indel ratio") - #p <- p + scale_y_log10() - p <- p + geom_point(alpha=0.5, aes(size=log10(nIndels))) - p <- p + geom_smooth(size=2, aes(weight=nIndels)) - print(p) - - name = "SNP counts by functional class" - molten = melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"), id.vars=c("Novelty", "FunctionalClass"), measure.vars=c(c("nSNPs"))) - p <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass) - p <- p + opts(title = name) - p <- p + scale_y_log10("No. of SNPs") - p <- p + geom_bar(position="dodge") - print(p) -} - -addSection <- function(name) { - par("mar", c(5, 4, 4, 2)) - frame() - title(name, cex=2) -} - -createMetricsBySamples <- function(VariantEvalRoot) { - byAFEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep=""))) - preQCMetrics <- read.table(preQCFile, header=T) - r = merge(merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants), preQCMetrics) - - # order the samples by nSNPs -- it's the natural ordering. - x = subset(r, Novelty=="all") - r$Sample <- factor(x$Sample, levels=x$Sample[order(x$nSNPs)]) - - # add highlight info - r$highlight = r$Sample %in% highlightSamples - - #r = merge(merge(preQCMetrics, byAFEval$TiTvVariantEvaluator), byAFEval$CountVariants) - return(subset(r, Sample != "all")) -} - -if ( onCMDLine || LOAD_DATA ) { - metricsBySites <- createMetricsBySites(VariantEvalRoot) - metricsBySamples <- createMetricsBySamples(VariantEvalRoot) -} - -# ------------------------------------------------------- -# Per sample plots -# ------------------------------------------------------- - -perSamplePlots <- function(metricsBySamples) { - metricsBySamples$highlightTextSizes = c(1,2)[metricsBySamples$highlight+1] - sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes)) - sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted")) - xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") - myRug <- geom_rug(position="jitter") - #myRug <- geom_rug(aes(x = NULL)) - - measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "nInsertions", "nDeletions", "deletionInsertionRatio") - # Counts vs. Allele frequency - name = "Variant counts by allele count" - for ( measure in measures ) { - molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure)) - p <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value) - p <- p + opts(title = paste(name, ":", measure)) - p <- p + geom_smooth(alpha=0.5, aes(group=Novelty)) - p <- p + sampleTextLabel + sampleTextLabelScale - p <- p + myRug - p <- p + facet_grid(Novelty ~ ., scales="free") - # how do we remove the labels? - p <- p + xAxis - print(p) - } - - for ( novelty in c("all", "known", "novel") ) { - # TODO -- how can I color it as before? - # TODO -- add marginal distributions? - molten = melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures) - p <- ggplot(data=molten, aes(x=Sample, y=value)) - p <- p + opts(title = paste(name, ":", novelty)) -# p <- p + scale_y_log10("Number of variants") -# p <- p + geom_point(alpha=0.5, size=4) - p <- p + sampleTextLabel + sampleTextLabelScale - p <- p + myRug - p <- p + facet_grid(variable ~ ., scales="free") - # how do we remove the labels? - p <- p + xAxis - print(p) - } -} - -# ------------------------------------------------------- -# Actually invoke the above plotting functions -# ------------------------------------------------------- - -if ( ! is.na(outputPDF) ) { - pdf(outputPDF, height=8.5, width=11) -} - -# Table of overall counts and quality -textplot(as.data.frame(summaryTable(metricsBySites)), show.rownames=F) -title(paste("Overall summary metrics for project", ProjectName), cex=3) - -summaryPlots(metricsBySites) -perSamplePlots(metricsBySamples) - -if ( ! is.na(outputPDF) ) { - dev.off() -} - - -# countVariants = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty != "all" & Sample != "all") -# countVariantsNoNovelty = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty == "all" & Sample != "all") -# titv = subset(as.data.frame(d$TiTvVariantEvaluator), Filter=="called" & Novelty != "all" & Sample != "all") -# titv$count = titv$nTi + titv$nTv -# -# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count)) -# + geom_point() -# + geom_smooth(weight=count)) -# -# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count)) -# + geom_point() -# + geom_smooth(weight=count, level=0.99, method="lm")) -# -# print(qplot(data=countVariants, Sample, nInsertions, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(nVariantLoci)) -# + geom_point() -# + geom_smooth(weight=count)) -# -# # moltenCountVariants = melt(subset(countVariants, Filter == "called" & FunctionalClass == "all"), id.vars=c("Sample", "Novelty"), -# # measure.vars=c("hetHomRatio", "deletionInsertionRatio")) -# # -# # qplot(data=subset(moltenCountVariants), Sample, value, facets = Novelty ~ variable, log="", color=Novelty) -# -# # TODO -- really could use nIndels, not just nInsertions and nDeletions, as CountVariants output -# -# plotByNoveltyAndSample <- function(data, measures) { -# sub = subset(data, Filter == "called" & FunctionalClass == "all") -# molten = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=measures) -# print(qplot(data=molten, Sample, value, color=Novelty) + facet_grid( variable ~ ., scales="free")) -# } -# -# plotBySample <- function(data, measures) { -# sub = subset(data, Filter == "called" & FunctionalClass == "all" & Novelty == "all") -# molten = melt(sub, id.vars=c("Sample"), measure.vars=measures) -# print(summary(molten)) -# print(head(molten)) -# print(qplot(data=molten, Sample, value) + facet_grid( variable ~ ., scales="free")) -# molten -# } -# -# scatterCounts <- function(data, measures) { -# sub = subset(data, Filter == "called" & FunctionalClass == "all") -# for ( measure in measures ) { -# print(measure) -# raw = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=c(measure)) -# xy = cast(raw, Sample ~ Novelty) -# print(qplot(data=xy, known, novel, geom="blank", main=measure) + geom_text(aes(label=Sample))) -# } -# } -# -# variantMeasures = c("nSNPs", "nInsertions", "nDeletions", "nSingletons") -# genotypingMeasures = c("nHomVar", "nHets", "nNoCalls") -# -# plotByNoveltyAndSample(countVariants, variantMeasures) -# scatterCounts(countVariants, variantMeasures) -# -# plotBySample(countVariantsNoNovelty, genotypingMeasures) -