Merge pull request #680 from broadinstitute/ldg_VQSRscript

Update VQSR Rnd BQSR  script generation code for compatibility with late...
This commit is contained in:
droazen 2014-07-11 10:16:37 -04:00
commit b8751ad598
4 changed files with 31 additions and 28 deletions

View File

@ -510,17 +510,17 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
stream.println("dummyData <- " + dataFrame + "[1,]");
stream.println("dummyData$x <- NaN");
stream.println("dummyData$y <- NaN");
stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
stream.println("p1 = p +ggtitle(\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\", space=\"rgb\")");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))");
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),guide=\"legend\",labels=c(\"filtered\",\"retained\"))");
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p3 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1),guide=\"legend\", labels=c(\"neg\", \"pos\"))");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))");
stream.println("p4 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1),guide=\"legend\", labels=c(\"novel\",\"known\"))");
stream.println("arrange(p1, p2, p3, p4, ncol=2)");
}
}

View File

@ -15,7 +15,7 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
inputFileName = "~/Desktop/broadLocal/projects/pipelinePerformance/FullProcessingPipeline.jobreport.txt"
inputFileName = "~/workspaces/scratch/SingleSample_VQSRwNewSOR.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
@ -57,7 +57,7 @@ plotJobsGantt <- function(gatkReport, sortOverall, title, includeText) {
p <- p + xlim(0, maxRelTime * 1.3)
p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS))
p <- p + ylab("Job number")
p <- p + opts(title = title)
p <- p + ggtitle(title)
print(p)
}
@ -95,7 +95,7 @@ plotProgressByTime <- function(gatkReport) {
p <- p + facet_grid(variable ~ ., scales="free")
p <- p + geom_line(size=2)
p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS))
p <- p + opts(title = "Job scheduling")
p <- p + ggtitle("Job scheduling")
print(p)
}
@ -123,7 +123,7 @@ plotGroup <- function(groupTable) {
# todo -- how do we group by annotations?
p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram()
p <- p + xlab(paste("runtime", RUNTIME_UNITS)) + ylab("No. of jobs")
p <- p + opts(title=paste("Job runtime histogram for", name))
p <- p + ggtitle(paste("Job runtime histogram for", name))
print(p)
}
@ -171,9 +171,9 @@ plotTimeByHost <- function(gatkReportData) {
p = p + facet_grid(analysisName ~ ., scale="free")
p = p + vis()
p = p + xlab("Job execution host")
p = p + opts(title = paste(name, "of job runtimes by analysis name and execution host"))
p = p + ggtitle(paste(name, "of job runtimes by analysis name and execution host"))
p = p + ylab(paste("Distribution of runtimes", RUNTIME_UNITS))
p = p + opts(axis.text.x=theme_text(angle=45, hjust=1, vjust=1))
p = p + theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
print(p)
}

View File

@ -6,19 +6,21 @@ library("tools") #For compactPDF in R 2.13+
library(gsalib)
if ( interactive() ) {
args <- c("NA12878.6.1.dedup.realign.recal.bqsr.grp.csv", "NA12878.6.1.dedup.realign.recal.bqsr.grp", NA)
} else {
args <- commandArgs(TRUE)
}
data <- read.csv(args[1])
data$Recalibration = as.factor(sapply(as.character(data$Recalibration),function(x) {
xu = toupper(x);
if (xu == "ORIGINAL") "BEFORE" else
if (xu == "RECALIBRATED") "AFTER" else
if (xu == "RECALIBRATION") "BQSR" else
xu }));
if (xu == "RECALIBRATED") "AFTER" else
if (xu == "RECALIBRATION") "BQSR" else
xu }));
gsa.report <- gsa.read.gatkreport(args[2])
@ -31,12 +33,12 @@ gsa.report$Arguments$Value[gsa.report$Argument$Value == "null"] = "None";
gsa.report.covariate.argnum = gsa.report$Arguments$Argument == "covariate";
gsa.report$Arguments$Value[gsa.report.covariate.argnum] = sapply(strsplit(gsa.report$Arguments$Value[gsa.report.covariate.argnum],","),function(x) {
y = sub("(^.+)Covariate","\\1",x); paste(y,collapse=",") } );
y = sub("(^.+)Covariate","\\1",x); paste(y,collapse=",") } );
data <- within(data, EventType <- factor(EventType, levels = rev(levels(EventType))))
numRG = length(unique(data$ReadGroup))
blankTheme = opts(panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank(), axis.ticks = theme_blank())
blankTheme = theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.ticks = element_blank())
# Viewport (layout 2 graphs top to bottom)
distributeGraphRows <- function(graphs, heights = c()) {
@ -72,7 +74,7 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
dIns=dIns[sample.int(length(dIns[,1]),min(length(dIns[,1]),2000)),] # don't plot too many values because it makes the PDFs too massive
dDel=dDel[sample.int(length(dDel[,1]),min(length(dDel[,1]),2000)),] # don't plot too many values because it makes the PDFs too massive
d=rbind(dSub, dIns, dDel)
if( cov != "QualityScore" ) {
p <- ggplot(d, aes(x=CovariateValue,y=Accuracy,alpha=log10(Observations))) + ylim(min(-10,d$Accuracy),max(10,d$Accuracy)) +
geom_abline(intercept=0, slope=0, linetype=2) +
@ -81,25 +83,25 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
blankTheme
if(cov == "Cycle") {
b <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("BEFORE"="maroon1","AFTER"="blue","BQSR"="black")) + facet_grid(.~EventType) +
opts(axis.text.x=theme_text(angle=90, hjust=0))
theme(axis.text.x=element_text(angle=90, hjust=0))
p <- ggplot(d, aes(x=CovariateValue,y=AverageReportedQuality,alpha=log10(Observations))) +
xlab(paste(cov,"Covariate")) +
ylab("Mean Quality Score") + ylim(0,max(42,d$AverageReportedQuality)) +
blankTheme
e <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("BEFORE"="maroon1","AFTER"="blue","BQSR"="black")) + facet_grid(.~EventType) +
opts(axis.text.x=theme_text(angle=90, hjust=0))
theme(axis.text.x=element_text(angle=90, hjust=0))
} else {
c <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("BEFORE"="maroon1","AFTER"="blue","BQSR"="black")) + facet_grid(.~EventType) +
opts(axis.text.x=theme_text(angle=90, hjust=0)) + xlab(paste(cov,"Covariate (3 base suffix)"))
theme(axis.text.x=element_text(angle=90, hjust=0)) + xlab(paste(cov,"Covariate (3 base suffix)"))
p <- ggplot(d, aes(x=CovariateValue,y=AverageReportedQuality,alpha=log10(Observations))) +
xlab(paste(cov,"Covariate (3 base suffix)")) +
ylab("Mean Quality Score") +
blankTheme
f <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("BEFORE"="maroon1","AFTER"="blue","BQSR"="black")) + facet_grid(.~EventType) +
opts(axis.text.x=theme_text(angle=90, hjust=0))
theme(axis.text.x=element_text(angle=90, hjust=0))
}
} else {
@ -117,7 +119,7 @@ for(cov in levels(data$CovariateName)) { # for each covariate in turn
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations,y=..ndensity..),alpha=0.6,binwidth=1,position="identity")
d <- d + scale_fill_manual(values=c("BEFORE"="maroon1","AFTER"="blue","BQSR"="black"))
d <- d + facet_grid(.~EventType)
# d <- d + scale_y_continuous(formatter="comma")
# d <- d + scale_y_continuous(formatter="comma")
}
}
@ -154,3 +156,4 @@ if ( ! is.na(args[3]) ) {
compactPDF(args[2])
}
}

View File

@ -27,7 +27,7 @@ distributeGraphRows <- function(graphs, heights = c()) {
distributeLogGraph <- function(graph, xName) {
continuousGraph <- graph + scale_x_continuous(xName)
logGraph <- graph + scale_x_log10(xName) + opts(title="")
logGraph <- graph + scale_x_log10(xName) + ggtitle("")
distributeGraphRows(list(continuousGraph, logGraph))
}
@ -194,7 +194,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
titleText=paste(titleText, note, sep="\n")
}
paste(titleText)
title <- opts(title=titleText)
title <- ggtitle(titleText)
determineFacet <- function(onX) {
if ( onX ) {
@ -232,7 +232,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
distGraph <- distGraph + facet_grid(distFacet, scales=scale)
distGraph <- distGraph + ylab("Relative frequency")
distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none")
distGraph <- distGraph + theme(axis.text.x=element_text(angle=-45)) # , legend.position="none")
} else {
distGraph <- NA
}