diff --git a/R/exomePostQC.R b/R/exomePostQC.R new file mode 100644 index 000000000..5ee5287ee --- /dev/null +++ b/R/exomePostQC.R @@ -0,0 +1,271 @@ +library("gsalib", lib.loc="/Users/depristo/Desktop/broadLocal/GATK/trunk/R/") +require("ggplot2") + +args = commandArgs(TRUE); + +onCMDLine = ! is.na(args[1]) +LOAD_DATA = F + +parseHighlightSamples <- function(s) { + return(unlist(strsplit(s, ",", fixed=T))) +} + +if ( onCMDLine ) { + ProjectName = args[1] + VariantEvalRoot = args[2] + preQCFile = args[3] + outputPDF = args[4] + if ( ! is.na(args[5]) ) + highlightSamples = parseHighlightSamples(args[5]) + else + highlightSamples = c() +} else { + ProjectName = "InDevelopmentInR" + preQCFile <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated.preQC.tsv" + VariantEvalRoot <- "~/Desktop/broadLocal/GATK/trunk/qcTestData/GoT2D_exomes_batch_005.cleaned.snps_and_indels.filtered.annotated" + outputPDF = "bar.pdf" + highlightSamples = c() # parseHighlightSamples("29029,47243") +} + +expandVEReport <- function(d) { + d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2) + d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2) + d$CountVariants$nIndels = d$CountVariants$nInsertions + d$CountVariants$nDeletions + return(d) +} + +createMetricsBySites <- function(VariantEvalRoot, PreQCMetrics) { + # Metrics by sites: + # bySite -> counts of SNPs and Indels by novelty, with expectations + # byAF -> snps and indels (known / novel) + r = list( bySite = expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".summary.eval", sep=""))), + byAF = gsa.read.gatkreport(paste(VariantEvalRoot, ".byAF.eval", sep=""))) + r$byAF$CountVariants$nIndels = r$byAF$CountVariants$nInsertions + r$byAF$CountVariants$nDeletions + + nChrom = 1 / min(r$byAF$TiTvVariantEvaluator$AlleleFrequency[r$byAF$TiTvVariantEvaluator$AlleleFrequency > 0]) + r$byAF$TiTvVariantEvaluator$nSNPs = r$byAF$TiTvVariantEvaluator$nTi + r$byAF$TiTvVariantEvaluator$nTv + r$byAF$CountVariants$AC = r$byAF$CountVariants$AlleleFrequency * nChrom + r$byAF$TiTvVariantEvaluator$AC = r$byAF$TiTvVariantEvaluator$AlleleFrequency * nChrom + return(r) +} + +summaryTable <- function(metricsBySites, metricsBySample) { + # SNP summary statistics + merged = merge(metricsBySites$bySite$CountVariants, metricsBySites$bySite$TiTvVariantEvaluator) + sub <- subset(merged, FunctionalClass=="all") + raw = melt(sub, id.vars=c("Novelty"), measure.vars=c("nProcessedLoci", "nSNPs", "tiTvRatio", "nIndels", "deletionInsertionRatio")) + table = cast(raw, Novelty ~ ...) + # doesn't work with textplot + colnames(table) <- c("Novelty", "Target size (bp)", "No. SNPs", "Ti/Tv", "No. Indels", "deletion/insertion ratio") + return(table) +} + +summaryPlots <- function(metricsBySites) { + name = "SNP and Indel count by novelty and allele frequency" + molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty != "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(c("nSNPs", "nIndels"))) + p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable) + p <- p + opts(title = name) + p <- p + scale_y_log10("Number of variants") + p <- p + geom_area() + p <- p + facet_grid(variable ~ ., scales="free") + p <- p + scale_x_continuous("Allele count (AC)") + print(p) + p <- p + scale_x_log10("Allele count (AC)") + print(p) + + # Counts vs. Allele frequency + name = "Variant counts by allele count" + for ( measure in c("nSNPs", "nIndels")) { + molten = melt(subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & AC > 0), id.vars=c("Novelty", "AC"), measure.vars=c(measure)) + p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty), group=variable) + p <- p + opts(title = paste(name, ":", measure)) + p <- p + scale_y_log10("Number of variants") + p <- p + scale_x_log10("Allele count (AC)") + p <- p + geom_point(alpha=0.5, size=4) + p <- p + geom_smooth(aes(weight=value), size=1, method="lm", formula = y ~ x) + p <- p + facet_grid(Novelty ~ ., scales="free") + print(p) + } + + name = "Transition / transversion ratio by allele count" + byAFNoAll = subset(metricsBySites$byAF$TiTvVariantEvaluator, Novelty != "all" & FunctionalClass == "all" & AC > 0) + p <- ggplot(data=byAFNoAll, aes(x=AC, y=tiTvRatio, color=Novelty)) + p <- p + scale_y_continuous("Transition / transversion ratio", limits=c(0,4)) + p <- p + opts(title = name) + p <- p + geom_smooth(size=2) + p <- p + geom_point(aes(size=log10(nSNPs), weight=nSNPs), alpha=0.5) + p <- p + scale_x_continuous("Allele count (AC)") + print(p) + p <- p + scale_x_log10("Allele count (AC)") + print(p) + + # SNPs to indels ratio by allele frequency + name = "SNPs to indels ratio by allele frequency" + metricsBySites$byAF$CountVariants$SNP.Indel.Ratio = metricsBySites$byAF$CountVariants$nSNPs / metricsBySites$byAF$CountVariants$nIndels + metricsBySites$byAF$CountVariants$SNP.Indel.Ratio[metricsBySites$byAF$CountVariants$nIndels == 0] = NaN + p <- ggplot(data=subset(metricsBySites$byAF$CountVariants, FunctionalClass == "all" & Novelty == "all"), aes(x=AC, y=SNP.Indel.Ratio)) + p <- p + opts(title = name) + p <- p + scale_y_continuous("SNP to indel ratio") + #p <- p + scale_y_log10() + p <- p + geom_point(alpha=0.5, aes(size=log10(nIndels))) + p <- p + geom_smooth(size=2, aes(weight=nIndels)) + print(p) + + name = "SNP counts by functional class" + molten = melt(subset(metricsBySites$bySite$CountVariants, Novelty != "all" & FunctionalClass != "all"), id.vars=c("Novelty", "FunctionalClass"), measure.vars=c(c("nSNPs"))) + p <- ggplot(data=molten, aes(x=FunctionalClass, y=value, fill=Novelty), group=FunctionalClass) + p <- p + opts(title = name) + p <- p + scale_y_log10("No. of SNPs") + p <- p + geom_bar(position="dodge") + print(p) +} + +addSection <- function(name) { + par("mar", c(5, 4, 4, 2)) + frame() + title(name, cex=2) +} + +createMetricsBySamples <- function(VariantEvalRoot) { + byAFEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep=""))) + #preQCMetrics <- read.table(preQCFile, header=T) + r = merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants) + + # order the samples by nSNPs -- it's the natural ordering. + x = subset(r, Novelty=="all") + r$Sample <- factor(x$Sample, levels=x$Sample[order(x$nSNPs)]) + + # add highlight info + r$highlight = r$Sample %in% highlightSamples + + #r = merge(merge(preQCMetrics, byAFEval$TiTvVariantEvaluator), byAFEval$CountVariants) + return(subset(r, Sample != "all")) +} + +if ( onCMDLine || LOAD_DATA ) { + metricsBySites <- createMetricsBySites(VariantEvalRoot) + metricsBySamples <- createMetricsBySamples(VariantEvalRoot) +} + +# ------------------------------------------------------- +# Per sample plots +# ------------------------------------------------------- + +perSamplePlots <- function(metricsBySamples) { + metricsBySamples$highlightTextSizes = c(1,2)[metricsBySamples$highlight+1] + sampleTextLabel <- geom_text(aes(label=Sample, size=highlightTextSizes)) + sampleTextLabelScale <- scale_size("Highlighted samples", to=c(3,5), breaks=c(1,2), labels=c("regular", "highlighted")) + xAxis <- scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + myRug <- geom_rug(position=jitter) + #myRug <- geom_rug(aes(x = NULL)) + + measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "nInsertions", "nDeletions", "deletionInsertionRatio") + # Counts vs. Allele frequency + name = "Variant counts by allele count" + for ( measure in measures ) { + molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure)) + p <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value) + p <- p + opts(title = paste(name, ":", measure)) + p <- p + geom_smooth(alpha=0.5, aes(group=Novelty)) + p <- p + sampleTextLabel + sampleTextLabelScale + p <- p + myRug + p <- p + facet_grid(Novelty ~ ., scales="free") + # how do we remove the labels? + p <- p + xAxis + print(p) + } + + for ( novelty in c("all", "known", "novel") ) { + # TODO -- how can I color it as before? + # TODO -- add marginal distributions? + molten = melt(subset(metricsBySamples, Novelty==novelty), id.vars=c("Sample", "highlightTextSizes"), measure.vars=measures) + p <- ggplot(data=molten, aes(x=Sample, y=value)) + p <- p + opts(title = paste(name, ":", novelty)) +# p <- p + scale_y_log10("Number of variants") +# p <- p + geom_point(alpha=0.5, size=4) + p <- p + sampleTextLabel + sampleTextLabelScale + p <- p + myRug + p <- p + facet_grid(variable ~ ., scales="free") + # how do we remove the labels? + p <- p + xAxis +print(p) + } +} + +# ------------------------------------------------------- +# Actually invoke the above plotting functions +# ------------------------------------------------------- + +if ( ! is.na(outputPDF) ) { + pdf(outputPDF, height=8.5, width=11) +} + +# Table of overall counts and quality +textplot(as.data.frame(summaryTable(metricsBySites)), show.rownames=F) +title(paste("Overall summary metrics for project", ProjectName), cex=3) + +summaryPlots(metricsBySites) +perSamplePlots(metricsBySamples) + +if ( ! is.na(outputPDF) ) { + dev.off() +} + + +# countVariants = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty != "all" & Sample != "all") +# countVariantsNoNovelty = subset(as.data.frame(d$CountVariants), Filter=="called" & Novelty == "all" & Sample != "all") +# titv = subset(as.data.frame(d$TiTvVariantEvaluator), Filter=="called" & Novelty != "all" & Sample != "all") +# titv$count = titv$nTi + titv$nTv +# +# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count)) +# + geom_point() +# + geom_smooth(weight=count)) +# +# print(qplot(data=titv, Sample, tiTvRatio, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(count)) +# + geom_point() +# + geom_smooth(weight=count, level=0.99, method="lm")) +# +# print(qplot(data=countVariants, Sample, nInsertions, group=Novelty, color=Novelty, facets = FunctionalClass ~ Novelty, size=log10(nVariantLoci)) +# + geom_point() +# + geom_smooth(weight=count)) +# +# # moltenCountVariants = melt(subset(countVariants, Filter == "called" & FunctionalClass == "all"), id.vars=c("Sample", "Novelty"), +# # measure.vars=c("hetHomRatio", "deletionInsertionRatio")) +# # +# # qplot(data=subset(moltenCountVariants), Sample, value, facets = Novelty ~ variable, log="", color=Novelty) +# +# # TODO -- really could use nIndels, not just nInsertions and nDeletions, as CountVariants output +# +# plotByNoveltyAndSample <- function(data, measures) { +# sub = subset(data, Filter == "called" & FunctionalClass == "all") +# molten = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=measures) +# print(qplot(data=molten, Sample, value, color=Novelty) + facet_grid( variable ~ ., scales="free")) +# } +# +# plotBySample <- function(data, measures) { +# sub = subset(data, Filter == "called" & FunctionalClass == "all" & Novelty == "all") +# molten = melt(sub, id.vars=c("Sample"), measure.vars=measures) +# print(summary(molten)) +# print(head(molten)) +# print(qplot(data=molten, Sample, value) + facet_grid( variable ~ ., scales="free")) +# molten +# } +# +# scatterCounts <- function(data, measures) { +# sub = subset(data, Filter == "called" & FunctionalClass == "all") +# for ( measure in measures ) { +# print(measure) +# raw = melt(sub, id.vars=c("Sample", "Novelty"), measure.vars=c(measure)) +# xy = cast(raw, Sample ~ Novelty) +# print(qplot(data=xy, known, novel, geom="blank", main=measure) + geom_text(aes(label=Sample))) +# } +# } +# +# variantMeasures = c("nSNPs", "nInsertions", "nDeletions", "nSingletons") +# genotypingMeasures = c("nHomVar", "nHets", "nNoCalls") +# +# plotByNoveltyAndSample(countVariants, variantMeasures) +# scatterCounts(countVariants, variantMeasures) +# +# plotBySample(countVariantsNoNovelty, genotypingMeasures) + diff --git a/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala b/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala new file mode 100755 index 000000000..fa02f82b3 --- /dev/null +++ b/scala/qscript/oneoffs/depristo/ExomePostQCEval.scala @@ -0,0 +1,67 @@ +package oneoffs.depristo + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.function.JavaCommandLineFunction + +class ExomePostQCEval extends QScript { + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + @Argument(shortName = "eval", doc="ref", required=true) + var evalVCF: File = _ + + @Argument(shortName = "intervals", doc="intervals", required=true) + val myIntervals: String = null; + + @Argument(shortName = "dbSNP", doc="dbSNP", required=false) + val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_132.b37.vcf") + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.reference_sequence = referenceFile; + this.memoryLimit = 4 + } + + // TODO -- should include "standard" eval for plotting expectations + + def script = { + // The basic summary eval + createEval(".summary", + List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), + List("EvalRod", "CompRod", "Novelty", "FunctionalClass")) + + // The basic summary eval, by AF + createEval(".byAF", + List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), + List("EvalRod", "CompRod", "Novelty", "AlleleFrequency", "FunctionalClass")) + + // By sample + createEval(".bySample", + List("TiTvVariantEvaluator", "CountVariants", "CompOverlap"), + List("EvalRod", "CompRod", "Novelty", "Sample")) + } + + def createEval(prefix: String, evalModules: List[String], extraStrats: List[String]) { + val eval = new Eval(evalVCF) + eval.out = swapExt(evalVCF,".vcf", prefix + ".eval") + eval.evalModule = evalModules + eval.stratificationModule = List("EvalRod", "CompRod", "Novelty") ::: extraStrats + add(eval) + } + + class Eval(@Input vcf: File) extends VariantEval with UNIVERSAL_GATK_ARGS { + this.rodBind :+= RodBind("eval", "VCF", evalVCF) + if ( dbSNP.exists() ) + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.doNotUseAllStandardStratifications = true + this.doNotUseAllStandardModules = true + this.intervalsString = List(myIntervals); + } +} +