gatk-3.8/R/exomeQC.R

272 lines
12 KiB
R

library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/")
require("ggplot2")
require("gplots")
args = commandArgs(TRUE)
onCMDLine = ! is.na(args[1])
LOAD_DATA = F
parseHighlightSamples <- function(s) {
return(unlist(strsplit(s, ",", fixed=T)))
}
if ( onCMDLine ) {
ProjectName = args[1]
VariantEvalRoot = args[2]
preQCFile = args[3]
outputPDF = args[4]
if ( ! is.na(args[5]) )
highlightSamples = parseHighlightSamples(args[5])
else
highlightSamples = c()
} else {
ProjectName = "InDevelopmentInR"
preQCFile <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005_per_sample_metrics.tsv"
VariantEvalRoot <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated"
outputPDF = "bar.pdf"
highlightSamples = c() # parseHighlightSamples("29029,47243")
}
expandVEReport <- function(d) {
d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2)
d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2)
d$CountVariants$nIndels = d$CountVariants$nInsertions + d$CountVariants$nDeletions
return(d)
}
createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) {
# Metrics by sites:
# bySite -> counts of SNPs and Indels by novelty, with expectations
# byAF -> snps and indels (known / novel)
r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))),
byAF = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAF.eval", sep="")))
r$byAF$CountVariants$nIndels = r$byAF$CountVariants$nInsertions + r$byAF$CountVariants$nDeletions
nChrom = 1 / min(r$byAF$TiTvVariantEvaluator$AlleleFrequency[r$byAF$TiTvVariantEvaluator$AlleleFrequency > 0])
r$byAF$TiTvVariantEvaluator$nSNPs = r$byAF$TiTvVariantEvaluator$nTi + r$byAF$TiTvVariantEvaluator$nTv
r$byAF$CountVariants$AC = r$byAF$CountVariants$AlleleFrequency * nChrom
r$byAF$TiTvVariantEvaluator$AC = r$byAF$TiTvVariantEvaluator$AlleleFrequency * nChrom
return(r)
}
summaryTable <- function(metricsBySites, metricsBySample) {
# SNP summary statistics
merged = merge(metricsBySites$bySite$CountVariants, metricsBySites$bySite$TiTvVariantEvaluator)
sub <- subset(merged, FunctionalClass=="all")
raw = melt(sub, id.vars=c("Novelty"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio"))
table = cast(raw, Novelty ~ ...)
# doesn't work with textplot
colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio")
return(table)
}
summaryPlots <- function(metricsBySites) {
name = "SNP and Indel count by novelty and allele frequency"
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels")))
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
p <- p + opts(title = name)
p <- p + scale_y_log10("Number of variants")
p <- p + geom_area()
p <- p + facet_grid(variable ~ ., scales="free")
p <- p + scale_x_continuous("Allele count (AC)")
print(p)
p <- p + scale_x_log10("Allele count (AC)")
print(p)
# Counts vs. Allele frequency
name = "Variant counts by allele count"
for ( measure in c("nSNPs", "nIndels")) {
molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure))
p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable)
p <- p + opts(title = paste(name, ":", measure))
p <- p + scale_y_log10("Number of variants")
p <- p + scale_x_log10("Allele count (AC)")
p <- p + geom_point(alpha=0.5, size=4)
p <- p + geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x)
p <- p + facet_grid(Novelty ~ ., scales="free")
print(p)
}
name = "Transition / transversion ratio by allele count"
byAFNoAll = subset(metricsBySites$byAF$TiTvVariantEvaluator, Novelty != "all" & FunctionalClass == "all" & AC > 0)
p <- ggplot(data=byAFNoAll, aes(x=AC, y=tiTvRatio, color=Novelty))
p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4))
p <- p + opts(title = name)
p <- p + geom_smooth(size=2)
p <- p + geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5)
p <- p + scale_x_continuous("Allele count (AC)")
print(p)
p <- p + scale_x_log10("Allele count (AC)")
print(p)
# SNPs to indels ratio by allele frequency
name = "SNPs to indels ratio by allele frequency"
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio = metricsBySites$byAF$CountVariants$nSNPs / metricsBySites$byAF$CountVariants$nIndels
metricsBySites$byAF$CountVariants$SNP.Indel.Ratio[metricsBySites$byAF$CountVariants$nIndels == 0] = NaN
p <- ggplot(data=subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty == "all"), aes(x=AC, y=SNP.Indel.Ratio))
p <- p + opts(title = name)
p <- p + scale_y_continuous("SNP to indel ratio")
#p <- p + scale_y_log10()
p <- p + geom_point(alpha=0.5, aes(size=log10(nIndels)))
p <- p + geom_smooth(size=2, aes(weight=nIndels))
print(p)
name = "SNP counts by functional class"
molten = melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"), id.vars=c("Novelty", "FunctionalClass"), measure.vars=c(c("nSNPs")))
p <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass)
p <- p + opts(title = name)
p <- p + scale_y_log10("No. of SNPs")
p <- p + geom_bar(position="dodge")
print(p)
}
addSection <- function(name) {
par("mar", c(5, 4, 4, 2))
frame()
title(name, cex=2)
}
createMetricsBySamples <- function(VariantEvalRoot) {
byAFEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep="")))
preQCMetrics <- read.table(preQCFile, header=T)
r = merge(merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants), preQCMetrics)
# order the samples by nSNPs -- it's the natural ordering.
x = subset(r, Novelty=="all")
r$Sample <- factor(x$Sample, levels=x$Sample[order(x$nSNPs)])
# add highlight info
r$highlight = r$Sample %in% highlightSamples
#r = merge(merge(preQCMetrics, byAFEval$TiTvVariantEvaluator), byAFEval$CountVariants)
return(subset(r, Sample != "all"))
}
if ( onCMDLine || LOAD_DATA ) {
metricsBySites <- createMetricsBySites(VariantEvalRoot)
metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
}
# -------------------------------------------------------
# Per sample plots
# -------------------------------------------------------
perSamplePlots <- function(metricsBySamples) {
metricsBySamples$highlightTextSizes = c(1,2)[metricsBySamples$highlight+1]
sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes))
sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted"))
xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
myRug <- geom_rug(position="jitter")
#myRug <- geom_rug(aes(x = NULL))
measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "nInsertions", "nDeletions", "deletionInsertionRatio")
# Counts vs. Allele frequency
name = "Variant counts by allele count"
for ( measure in measures ) {
molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure))
p <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value)
p <- p + opts(title = paste(name, ":", measure))
p <- p + geom_smooth(alpha=0.5, aes(group=Novelty))
p <- p + sampleTextLabel + sampleTextLabelScale
p <- p + myRug
p <- p + facet_grid(Novelty ~ ., scales="free")
# how do we remove the labels?
p <- p + xAxis
print(p)
}
for ( novelty in c("all", "known", "novel") ) {
# TODO -- how can I color it as before?
# TODO -- add marginal distributions?
molten = melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures)
p <- ggplot(data=molten, aes(x=Sample, y=value))
p <- p + opts(title = paste(name, ":", novelty))
# p <- p + scale_y_log10("Number of variants")
# p <- p + geom_point(alpha=0.5, size=4)
p <- p + sampleTextLabel + sampleTextLabelScale
p <- p + myRug
p <- p + facet_grid(variable ~ ., scales="free")
# how do we remove the labels?
p <- p + xAxis
print(p)
}
}
# -------------------------------------------------------
# Actually invoke the above plotting functions
# -------------------------------------------------------
if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
# Table of overall counts and quality
textplot(as.data.frame(summaryTable(metricsBySites)), show.rownames=F)
title(paste("Overall summary metrics for project", ProjectName), cex=3)
summaryPlots(metricsBySites)
perSamplePlots(metricsBySamples)
if ( ! is.na(outputPDF) ) {
dev.off()
}
# countVariants = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty != "all" & Sample != "all")
# countVariantsNoNovelty = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty == "all" & Sample != "all")
# titv = subset(as.data.frame(d$TiTvVariantEvaluator), Filter=="called" & Novelty != "all" & Sample != "all")
# titv$count = titv$nTi + titv$nTv
#
# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count))
# + geom_point()
# + geom_smooth(weight=count))
#
# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count))
# + geom_point()
# + geom_smooth(weight=count, level=0.99, method="lm"))
#
# print(qplot(data=countVariants, Sample, nInsertions, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(nVariantLoci))
# + geom_point()
# + geom_smooth(weight=count))
#
# # moltenCountVariants = melt(subset(countVariants, Filter == "called" & FunctionalClass == "all"), id.vars=c("Sample", "Novelty"),
# # measure.vars=c("hetHomRatio", "deletionInsertionRatio"))
# #
# # qplot(data=subset(moltenCountVariants), Sample, value, facets = Novelty ~ variable, log="", color=Novelty)
#
# # TODO -- really could use nIndels, not just nInsertions and nDeletions, as CountVariants output
#
# plotByNoveltyAndSample <- function(data, measures) {
# sub = subset(data, Filter == "called" & FunctionalClass == "all")
# molten = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=measures)
# print(qplot(data=molten, Sample, value, color=Novelty) + facet_grid( variable ~ ., scales="free"))
# }
#
# plotBySample <- function(data, measures) {
# sub = subset(data, Filter == "called" & FunctionalClass == "all" & Novelty == "all")
# molten = melt(sub, id.vars=c("Sample"), measure.vars=measures)
# print(summary(molten))
# print(head(molten))
# print(qplot(data=molten, Sample, value) + facet_grid( variable ~ ., scales="free"))
# molten
# }
#
# scatterCounts <- function(data, measures) {
# sub = subset(data, Filter == "called" & FunctionalClass == "all")
# for ( measure in measures ) {
# print(measure)
# raw = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=c(measure))
# xy = cast(raw, Sample ~ Novelty)
# print(qplot(data=xy, known, novel, geom="blank", main=measure) + geom_text(aes(label=Sample)))
# }
# }
#
# variantMeasures = c("nSNPs", "nInsertions", "nDeletions", "nSingletons")
# genotypingMeasures = c("nHomVar", "nHets", "nNoCalls")
#
# plotByNoveltyAndSample(countVariants, variantMeasures)
# scatterCounts(countVariants, variantMeasures)
#
# plotBySample(countVariantsNoNovelty, genotypingMeasures)