Updates formatting and combines plots into the tearsheet.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5109 348d0f76-0448-11de-a6fe-93d51630548a
2011-01-28 17:45:23 +00:00 · 2011-01-28 17:45:23 +00:00 · 88ea60b864
parent 2182b8c7e2
commit 88ea60b864
1 changed files with 74 additions and 36 deletions
--- a/R/DataProcessingReport/GetTearsheetStats.R
+++ b/R/DataProcessingReport/GetTearsheetStats.R
@ -6,7 +6,7 @@
 ##change layouts so that it looks better
 ##get sample numbers in correctly

-.libPaths('/humgen/gsa-firehose2/pipeline/repositories/StingProduction/R/') 
+.libPaths('~/Documents/trunk/R/') 

 suppressMessages(library(gplots));
 suppressMessages(library(ReadImages));
@ -18,13 +18,13 @@ cmdargs = gsa.getargs(
    list(
        bamlist = list(value=NA, doc="list of BAM files"),
        evalroot = list(value=NA, doc="VariantEval root"),
-        tearout = list(value=NA, doc="Output path for tearsheet PDF"),
-        plotout = list(value=NA, doc="Output path for PDF")
+        tearout = list(value=NA, doc="Output path for tearsheet PDF")#,
+        #plotout = list(value=NA, doc="Output path for PDF")
    ),
    doc="Creates a tearsheet"
 );

-#if (0) {
+bamgetter<-function(){
 bamlist = read.table(cmdargs$bamlist);

 fclanes = c();
@ -65,16 +65,18 @@ squid_fclanes = gsub("A.XX", "", squid_fclanes);

 dproj = d[which(squid_fclanes %in% fclanes),];
 d2proj = d2[which(d2$"Project" %in% unique(dproj$"Project") & d2$"Sample" %in% dproj$"External ID"),];
-#}
+}

-tearsheetdrop <- "data/tearsheetdrop.jpg" #put the path to the tearsheet backdrop here

 tearsheet<-function(){
-	
-	pdf(file= cmdargs$tearout, width=22, height=15, pagecentre=TRUE, pointsize=24) 
+	tearsheetdrop <- "data/tearsheetdrop.jpg" #put the path to the tearsheet backdrop here
+
+	pdf(file= "tester.pdf", width=22, height=17, pagecentre=TRUE, pointsize=24) 
 	
 	#define layout
-	layout(matrix(c(1,1,2,4,3,5), ncol=2, nrow=3, byrow=TRUE), heights=c(1, 2.5,2.5), respect=FALSE)
+	postable<-matrix(c(1, 1, 1, 1, 1, 1, rep(c(2, 2, 2, 4, 4, 4), 5), rep(c(3, 3, 3, 4, 4, 4), 3), rep(c(3,3,3,5,5,5), 5), 6,6,6,7,7,7), nrow=15, ncol=6, byrow=TRUE)
+	layout(postable, heights=c(1, rep(.18, 13), 2), respect=FALSE)
+    
    
    #prep for title bar
    full<-strsplit(cmdargs$evalroot, "/")
@ -84,7 +86,7 @@ tearsheet<-function(){
    #plot title bar
    par(mar=c(0,0,0,0))
    plot(drop)
-    text(110, 45, name, family="serif", adj=c(0,0), cex=3, col=gray(.25))
+    text(155, 50, name, family="serif", adj=c(0,0), cex=3, col=gray(.25))
    

 	# Project summary
@ -111,18 +113,17 @@ tearsheet<-function(){
 	table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
 	print(nrow(table1))
 	rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target")
-	par(mar=c(4,4,4,4))
-	textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25)
-    title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.75)
+	par(mar=c(0,0,1,0))
+	textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25, valign="top")
+    title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)	
+    # Bases summary 

-	# Bases summary 
-
-	reads_per_lane_mean = signif(mean(dproj$"PF Reads (HS)"), 3);
-	reads_per_lane_sd = signif(sd(dproj$"PF Reads (HS)"), 3);
+	reads_per_lane_mean = format(mean(dproj$"PF Reads (HS)"), 8, 3,1, scientific=TRUE);
+	reads_per_lane_sd = format(sd(dproj$"PF Reads (HS)"), 8, 3,1, scientific=TRUE);
 	lanes<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)

-	used_bases_per_lane_mean = signif(mean(dproj$"PF HQ Aligned Q20 Bases"), 3);
-	used_bases_per_lane_sd = signif(sd(dproj$"PF HQ Aligned Q20 Bases"), 3);
+	used_bases_per_lane_mean = format(mean(dproj$"PF HQ Aligned Q20 Bases"),8, 3,1, scientific=TRUE);
+	used_bases_per_lane_sd = format(sd(dproj$"PF HQ Aligned Q20 Bases"), 8, 3,1, scientific=TRUE);
 	lanes<-c(lanes, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));

 	target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage"));
@ -142,12 +143,12 @@ tearsheet<-function(){
 	lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));


-	reads_per_sample_mean = signif(mean(d2proj$"PF Reads"), 3);
-	reads_per_sample_sd = signif(sd(d2proj$"PF Reads"), 3);
+	reads_per_sample_mean = format(mean(d2proj$"PF Reads"), 8, 3,1, scientific=TRUE);
+	reads_per_sample_sd = format(sd(d2proj$"PF Reads"), 8, 3,1, scientific=TRUE);
 	samps<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);

-	used_bases_per_sample_mean = signif(mean(d2proj$"PF HQ Aligned Q20 Bases"), 3);
-	used_bases_per_sample_sd = signif(sd(d2proj$"PF HQ Aligned Q20 Bases"), 3);
+	used_bases_per_sample_mean = format(mean(d2proj$"PF HQ Aligned Q20 Bases"),8, 3,1, scientific=TRUE);
+	used_bases_per_sample_sd = format(sd(d2proj$"PF HQ Aligned Q20 Bases"), 8, 3,1, scientific=TRUE);
 	samps<-c(samps, sprintf("%s +/- %s\n", used_bases_per_sample_mean, 	used_bases_per_sample_sd));

 	target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage"));
@ -171,9 +172,9 @@ tearsheet<-function(){
 		print(nrow(table2))

 	rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x")
-	par(mar=c(4,4,4,4))
-	textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25)
-	title(main="Bases Summary", family="sans", cex.main=1.75)
+	par(mar=c(0,0,1,0))
+	textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
+	title(main="Bases Summary", family="sans", cex.main=1.25, line=0)


 # Sequencing summary
@ -226,15 +227,12 @@ tearsheet<-function(){
 	
 	
 	table3<-rbind(paste(instrument), used_lanes, sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), sprintf("\tSequencing dates: %s to %s\n", start_date, end_date))
-		print(nrow(table3))
-		print(ncol(table3))
-		print(used_lanes)
-		print(instrument)
+		

  	rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane parities", "Read lengths", "Sequencing dates")
-	par(mar=c(4,4,4,4))
-	textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25)
-	title(main="Sequencing Summary", family="sans", cex.main=1.75)
+	par(mar=c(0,0,1,0))
+	textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25, valign="top")
+	title(main="Sequencing Summary", family="sans", cex.main=1.25, line=0)

 # Variant summary
 	
@ -258,8 +256,49 @@ tearsheet<-function(){

 	

-	textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25)
-	title(main="Variant Summary", family="sans", cex.main=1.75)
+	par(mar=c(0,0,0,0))
+	textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25, valign="top")
+	title(main="Variant Summary", family="sans", cex.main=1.25, line=-2)
+	
+#plots
+	eval.bysample = read.csv(paste(cmdargs$evalroot, ".SimpleMetricsBySample.csv", sep=""), header=TRUE, comment.char="#");
+	eval.bysample.called = subset(eval.bysample, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
+	eval.bysample.called.all = subset(eval.bysample.called, novelty_name == "all");
+	eval.bysample.called.known = subset(eval.bysample.called, novelty_name == "known");
+	eval.bysample.called.novel = subset(eval.bysample.called, novelty_name == "novel");
+
+	eval.ac = read.csv(paste(cmdargs$evalroot, ".MetricsByAc.csv", sep=""), header=TRUE, comment.char="#");
+	eval.ac.called = subset(eval.ac, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
+	eval.ac.called.all = subset(eval.ac.called, novelty_name == "all");
+	eval.ac.called.known = subset(eval.ac.called, novelty_name == "known");
+	eval.ac.called.novel = subset(eval.ac.called, novelty_name == "novel");
+
+	eval.func = read.csv(paste(cmdargs$evalroot, ".Functional_Class_Counts_by_Sample.csv", sep=""), header=TRUE, comment.char="#");
+	eval.func.called = subset(eval.func, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called");
+	eval.func.called.all = subset(eval.func.called, novelty_name == "all");
+	eval.func.called.known = subset(eval.func.called, novelty_name == "known");
+	eval.func.called.novel = subset(eval.func.called, novelty_name == "novel");
+
+
+	#boxplot(eval.bysample.called.all$CountVariants, eval.bysample.called.known$CountVariants, eval.bysample.called.novel$CountVariants, names=c("All", "Known", "Novel"), ylab="Variants per sample", main="", cex=1.3, cex.lab=1.3, cex.axis=1.3);
+
+	ind = order(eval.bysample.called.all$CountVariants);
+	plot(c(1:length(eval.bysample.called.all$CountVariants)), eval.bysample.called.all$CountVariants[ind], col="black", cex=1.3, cex.lab=1.3, cex.axis=1.3, main="Varients per Sample" xlab="Sample", ylab="Number of variants", bty="n", ylim=c(0, max(eval.bysample.called.all$CountVariants)));
+	points(c(1:length(eval.bysample.called.known$CountVariants)), eval.bysample.called.known$CountVariants[ind], col="blue", cex=1.3);
+	points(c(1:length(eval.bysample.called.novel$CountVariants)), eval.bysample.called.novel$CountVariants[ind], col="red", cex=1.3);
+	legend(0, max(eval.bysample.called.all$CountVariants)/2, c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
+
+	plot(eval.ac.called.all$AC, eval.ac.called.all$n, col="black", type="l", lwd=2, cex=1.3, cex.lab=1.3, cex.axis=1.3, xlab="Allele count", ylab="Number of variants", main="Varients by Allele Count", log="xy", bty="n");
+	points(eval.ac.called.known$AC, eval.ac.called.known$n, col="blue", type="l", lwd=2);
+	points(eval.ac.called.novel$AC, eval.ac.called.novel$n, col="red", type="l", lwd=2);
+	legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), lwd=2);
+
+	#plot(eval.func.called.all$Synonymous[ind] / (eval.func.called.all$Missense + eval.func.called.all$Nonsense)[ind], ylim=c(0, 2), cex=1.3, cex.lab=1.3, cex.axis=1.3, bty="n", xlab="Sample", ylab="Ratio of synonymous to non-synonymous variants", col="black");
+	#points(eval.func.called.known$Synonymous[ind] / (eval.func.called.known$Missense + eval.func.called.known$Nonsense)[ind], cex=1.3, col="blue");
+	#points(eval.func.called.novel$Synonymous[ind] / (eval.func.called.novel$Missense + eval.func.called.novel$Nonsense)[ind], cex=1.3, col="red");
+	#legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21);
+
+
 	
 	dev.off()
 	}
@ -309,4 +348,3 @@ legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt
 dev.off();
 }

-plots()