diff --git a/R/DataProcessingReport/GetTearsheetStats.R b/R/DataProcessingReport/GetTearsheetStats.R index fe1b97b13..344a7ad30 100644 --- a/R/DataProcessingReport/GetTearsheetStats.R +++ b/R/DataProcessingReport/GetTearsheetStats.R @@ -1,11 +1,27 @@ +##put titles/rownames left +##make titles blue +##decrease margins below titles +## put row names in black +##put background rows in. +##change layouts so that it looks better +##get sample numbers in correctly + + + +.libPaths('~/Sting/R/') #but make sure this is universal +#put use scripts here + +suppressMessages(library(gplots)); +suppressMessages(library(ReadImages)); + suppressMessages(library(gsalib)); suppressMessages(library(ROracle)); -cmdargs = getargs( +cmdargs = gsa.getargs( list( bamlist = list(value=NA, doc="list of BAM files"), evalroot = list(value=NA, doc="VariantEval root"), - statsout = list(value=NA, doc="Output path for stats"), + tearxsout = list(value=NA, doc="Output path for tearsheet PDF"), plotout = list(value=NA, doc="Output path for PDF") ), doc="Creates a tearsheet" @@ -54,156 +70,187 @@ dproj = d[which(squid_fclanes %in% fclanes),]; d2proj = d2[which(d2$"Project" %in% unique(dproj$"Project") & d2$"Sample" %in% dproj$"External ID"),]; #} -## Tear sheet components +tearsheetdrop <- "tearsheetdrop.jpg" #put the path to the tearsheet backdrop here -# Project summary -projects = paste(unique(dproj$"Project"), collapse=", "); -cat(sprintf("Project Summary (%s)\n", projects), file=cmdargs$statsout, append=FALSE); +tearsheet<-function(){ + + pdf(file= cmdargs$statsout, width=22, height=15, pagecentre=TRUE, pointsize=24) + + #define layout + layout(matrix(c(1,1,2,4,3,5), ncol=2, nrow=3, byrow=TRUE), heights=c(1, 2.5,2.5), respect=FALSE) + + #prep for title bar + full<-strsplit(cmdargs$evalroot, "/") + name<-strsplit(full[[1]][length(full[[1]])], ".",fixed=TRUE)[[1]][1] + title=paste(name, ": TEAR SHEET", sep="") + drop<-read.jpeg(system.file(tearsheetdrop, package="gsalib")) + + #plot title bar + par(mar=c(0,0,0,0)) + plot(drop) + text(110, 45, title, family="serif", adj=c(0,0), cex=3, col=gray(.25)) + -used_samples = length(unique(dproj$"External ID")); -cat(sprintf("\tUsed samples: %s\n", used_samples), file=cmdargs$statsout, append=TRUE); + # Project summary + projects = paste(unique(dproj$"Project"), collapse=", "); -unused_samples = 0; -cat(sprintf("\tUnused samples: %s\n", unused_samples), file=cmdargs$statsout, append=TRUE); + used_samples = length(unique(dproj$"External ID")); -sequencing_protocol = "Hybrid selection"; -cat(sprintf("\tSequencing protocol: %s\n", sequencing_protocol), file=cmdargs$statsout, append=TRUE); + unused_samples = 0; -bait_design = paste(unique(dproj$"Bait Set"), collapse=", "); -cat(sprintf("\tBait design: %s\n", bait_design), file=cmdargs$statsout, append=TRUE); + sequencing_protocol = "Hybrid selection"; #can this be extracted? -callable_target = paste(unique(dproj$"Target Territory"), collapse=", "); -cat(sprintf("\tCallable target: %s\n", callable_target), file=cmdargs$statsout, append=TRUE); + bait_design = paste(unique(dproj$"Bait Set"), collapse=", "); -# Bases summary per lane -cat("\nBases summary (excluding unused lanes/samples)\n", file=cmdargs$statsout, append=TRUE); + callable_target = paste(unique(dproj$"Target Territory"), collapse=", "); -cat("\tBases summary per lane\n", file=cmdargs$statsout, append=TRUE); + table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target) + print(nrow(table1)) + rownames(table1)<-c("Samples","Sequencing Protocol", "Bait Design","Callable Target") + par(mar=c(4,4,4,4)) + textplot(table1, col.rownames="darkblue", show.colnames=FALSE, cex=1.25) + title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.75) -reads_per_lane_mean = mean(dproj$"PF Reads (HS)"); -reads_per_lane_sd = sd(dproj$"PF Reads (HS)"); -cat(sprintf("\t\tReads: %s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd), file=cmdargs$statsout, append=TRUE); + # Bases summary -used_bases_per_lane_mean = mean(dproj$"PF HQ Aligned Q20 Bases"); -used_bases_per_lane_sd = sd(dproj$"PF HQ Aligned Q20 Bases"); -cat(sprintf("\t\tUsed bases: %s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd), file=cmdargs$statsout, append=TRUE); + reads_per_lane_mean = signif(mean(dproj$"PF Reads (HS)"), 3); + reads_per_lane_sd = signif(sd(dproj$"PF Reads (HS)"), 3); + lanes<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd) -target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage")); -target_coverage_sd = sd(na.omit(dproj$"Mean Target Coverage")); -cat(sprintf("\t\tTarget coverage: %0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd), file=cmdargs$statsout, append=TRUE); + used_bases_per_lane_mean = signif(mean(dproj$"PF HQ Aligned Q20 Bases"), 3); + used_bases_per_lane_sd = signif(sd(dproj$"PF HQ Aligned Q20 Bases"), 3); + lanes<-c(lanes, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd)); -pct_loci_gt_10x_mean = mean(na.omit(dproj$"Target Bases 10x %")); -pct_loci_gt_10x_sd = sd(na.omit(dproj$"Target Bases 10x %")); -cat(sprintf("\t\t%% loci > 10x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd), file=cmdargs$statsout, append=TRUE); + target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage")); + target_coverage_sd = sd(na.omit(dproj$"Mean Target Coverage")); + lanes<-c(lanes, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd)); -pct_loci_gt_20x_mean = mean(na.omit(dproj$"Target Bases 20x %")); -pct_loci_gt_20x_sd = sd(na.omit(dproj$"Target Bases 20x %")); -cat(sprintf("\t\t%% loci > 20x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd), file=cmdargs$statsout, append=TRUE); + pct_loci_gt_10x_mean = mean(na.omit(dproj$"Target Bases 10x %")); + pct_loci_gt_10x_sd = sd(na.omit(dproj$"Target Bases 10x %")); + lanes<-c(lanes, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd)); -pct_loci_gt_30x_mean = mean(na.omit(dproj$"Target Bases 30x %")); -pct_loci_gt_30x_sd = sd(na.omit(dproj$"Target Bases 30x %")); -cat(sprintf("\t\t%% loci > 30x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd), file=cmdargs$statsout, append=TRUE); + pct_loci_gt_20x_mean = mean(na.omit(dproj$"Target Bases 20x %")); + pct_loci_gt_20x_sd = sd(na.omit(dproj$"Target Bases 20x %")); + lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd)); + + pct_loci_gt_30x_mean = mean(na.omit(dproj$"Target Bases 30x %")); + pct_loci_gt_30x_sd = sd(na.omit(dproj$"Target Bases 30x %")); + lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd)); -cat("\tBases summary per sample\n", file=cmdargs$statsout, append=TRUE); + reads_per_sample_mean = signif(mean(d2proj$"PF Reads"), 3); + reads_per_sample_sd = signif(sd(d2proj$"PF Reads"), 3); + samps<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd); -reads_per_sample_mean = mean(d2proj$"PF Reads"); -reads_per_sample_sd = sd(d2proj$"PF Reads"); -cat(sprintf("\t\tReads: %s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd), file=cmdargs$statsout, append=TRUE); + used_bases_per_sample_mean = signif(mean(d2proj$"PF HQ Aligned Q20 Bases"), 3); + used_bases_per_sample_sd = signif(sd(d2proj$"PF HQ Aligned Q20 Bases"), 3); + samps<-c(samps, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd)); -used_bases_per_sample_mean = mean(d2proj$"PF HQ Aligned Q20 Bases"); -used_bases_per_sample_sd = sd(d2proj$"PF HQ Aligned Q20 Bases"); -cat(sprintf("\t\tUsed bases: %s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd), file=cmdargs$statsout, append=TRUE); + target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage")); + target_coverage_sd = sd(na.omit(d2proj$"Mean Target Coverage")); + samps<-c(samps, sprintf("%0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd)); -target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage")); -target_coverage_sd = sd(na.omit(d2proj$"Mean Target Coverage")); -cat(sprintf("\t\tTarget coverage: %0.2fx +/- %0.2fx\n", target_coverage_mean, target_coverage_sd), file=cmdargs$statsout, append=TRUE); + pct_loci_gt_10x_mean = mean(na.omit(d2proj$"Target Bases 10x %")); + pct_loci_gt_10x_sd = sd(na.omit(d2proj$"Target Bases 10x %")); + samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd)); -pct_loci_gt_10x_mean = mean(na.omit(d2proj$"Target Bases 10x %")); -pct_loci_gt_10x_sd = sd(na.omit(d2proj$"Target Bases 10x %")); -cat(sprintf("\t\t%% loci > 10x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_10x_mean, pct_loci_gt_10x_sd), file=cmdargs$statsout, append=TRUE); + pct_loci_gt_20x_mean = mean(na.omit(d2proj$"Target Bases 20x %")); + pct_loci_gt_20x_sd = sd(na.omit(d2proj$"Target Bases 20x %")); + samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd)); -pct_loci_gt_20x_mean = mean(na.omit(d2proj$"Target Bases 20x %")); -pct_loci_gt_20x_sd = sd(na.omit(d2proj$"Target Bases 20x %")); -cat(sprintf("\t\t%% loci > 20x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_20x_mean, pct_loci_gt_20x_sd), file=cmdargs$statsout, append=TRUE); + pct_loci_gt_30x_mean = mean(na.omit(d2proj$"Target Bases 30x %")); + pct_loci_gt_30x_sd = sd(na.omit(d2proj$"Target Bases 30x %")); + samps<-c(samps, sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd)); -pct_loci_gt_30x_mean = mean(na.omit(d2proj$"Target Bases 30x %")); -pct_loci_gt_30x_sd = sd(na.omit(d2proj$"Target Bases 30x %")); -cat(sprintf("\t\t%% loci > 30x covered: %0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd), file=cmdargs$statsout, append=TRUE); + table2<-cbind(lanes, samps) + colnames(table2)<-c("Per lane", "Per sample") + print(nrow(table2)) -# Sequencing summary + rownames(table2)<-c("Reads", "Used bases", "Average target coverage", "% loci covered to 10x", "% loci covered to 20x","% loci covered to 30x") + par(mar=c(4,4,4,4)) + textplot(table2, rmar=1, col.rownames="dark blue", cex=1.25) + title(main="Bases Summary", family="sans", cex.main=1.75) cat("\nSequencing summary\n", file=cmdargs$statsout, append=TRUE); -instrument = "Illumina GA2"; -cat(sprintf("\tSequencer: %s\n", instrument), file=cmdargs$statsout, append=TRUE); -used_lanes = nrow(dproj); -cat(sprintf("\tUsed lanes: %s\n", used_lanes), file=cmdargs$statsout, append=TRUE); +# Sequencing summary -unused_lanes_by_sequencing = 0; -unused_lanes_by_analysis = 0; -cat(sprintf("\tUnused lanes: %s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), file=cmdargs$statsout, append=TRUE); + instrument = "Illumina GA2";#Can we get this? + used_lanes = nrow(dproj); + unused_lanes_by_sequencing = 0; #can we get this? + unused_lanes_by_analysis = 0; -lanes_per_sample_mean = mean(table(dproj$"External ID")); -lanes_per_sample_sd = sd(table(dproj$"External ID")); -lanes_per_sample_median = median(table(dproj$"External ID")); -cat(sprintf("\tLanes/sample: %0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), file=cmdargs$statsout, append=TRUE); -lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired")); -lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed")); -lanes_single = nrow(subset(dproj, dproj$"Lane Type" == "Single")); -cat(sprintf("\tLane parities: %s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), file=cmdargs$statsout, append=TRUE); + lanes_per_sample_mean = mean(table(dproj$"External ID")); + lanes_per_sample_sd = sd(table(dproj$"External ID")); + lanes_per_sample_median = median(table(dproj$"External ID")); + lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired")); + lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed")); + lanes_single = nrow(subset(dproj, dproj$"Lane Type" == "Single")); -read_length_mean = mean(dproj$"Mean Read Length (P)"); -read_length_sd = sd(dproj$"Mean Read Length (P)"); -read_length_median = median(dproj$"Mean Read Length (P)"); -cat(sprintf("\tRead length: %0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), file=cmdargs$statsout, append=TRUE); + read_length_mean = mean(dproj$"Mean Read Length (P)"); + read_length_sd = sd(dproj$"Mean Read Length (P)"); + read_length_median = median(dproj$"Mean Read Length (P)"); -date = dproj$"Run Date"; -date = sub("JAN", "01", date); -date = sub("FEB", "02", date); -date = sub("MAR", "03", date); -date = sub("APR", "04", date); -date = sub("MAY", "05", date); -date = sub("JUN", "06", date); -date = sub("JUL", "07", date); -date = sub("AUG", "08", date); -date = sub("SEP", "09", date); -date = sub("OCT", "10", date); -date = sub("NOV", "11", date); -date = sub("DEC", "12", date); -date = date[order(as.Date(date, format="%d-%m-%Y"))]; + date = dproj$"Run Date"; + date = sub("JAN", "01", date); + date = sub("FEB", "02", date); + date = sub("MAR", "03", date); + date = sub("APR", "04", date); + date = sub("MAY", "05", date); + date = sub("JUN", "06", date); + date = sub("JUL", "07", date); + date = sub("AUG", "08", date); + date = sub("SEP", "09", date); + date = sub("OCT", "10", date); + date = sub("NOV", "11", date); + date = sub("DEC", "12", date); + date = date[order(as.Date(date, format="%d-%m-%Y"))]; -start_date = date[1]; -end_date = date[length(date)]; -cat(sprintf("\tSequencing dates: %s to %s\n", start_date, end_date), file=cmdargs$statsout, append=TRUE); + start_date = date[1]; + end_date = date[length(date)]; + + + table3<-rbind(instrument, used_lanes, sprintf("%s rejected by sequencing, %s by analysis\n", unused_lanes_by_sequencing, unused_lanes_by_analysis), sprintf("%0.1f +/- %0.1f lanes (median=%0.1f)\n", lanes_per_sample_mean, lanes_per_sample_sd, lanes_per_sample_median), sprintf("%s paired, %s widowed, %s single\n", lanes_paired, lanes_widowed, lanes_single), sprintf("%0.1f +/- %0.1f bases (median=%0.1f)\n", read_length_mean, read_length_sd, read_length_median), sprintf("\tSequencing dates: %s to %s\n", start_date, end_date)) + print(nrow(table3)) + + rownames(table3)<-c("Sequencer", "Used lanes", "Unused lanes","Used lanes/sample", "Lane pariteies", "Read legnths", "Sequencing dates") + par(mar=c(4,4,4,4)) + textplot(table3, rmar=1, col.rownames="dark blue", show.colnames=FALSE, cex=1.25) + title(main="Sequencing Summary", family="sans", cex.main=1.75) # Variant summary -cat("\nVariant summary\n", file=cmdargs$statsout, append=TRUE); + + eval.counts = read.csv(paste(cmdargs$evalroot, ".Count_Variants.csv", sep=""), header=TRUE, comment.char="#"); + eval.counts.called = subset(eval.counts, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called"); + eval.counts.called.all = subset(eval.counts.called, novelty_name == "all")$nVariantLoci; + eval.counts.called.known = subset(eval.counts.called, novelty_name == "known")$nVariantLoci; + eval.counts.called.novel = subset(eval.counts.called, novelty_name == "novel")$nVariantLoci; -eval.counts = read.csv(paste(cmdargs$evalroot, ".Count_Variants.csv", sep=""), header=TRUE, comment.char="#"); -eval.counts.called = subset(eval.counts, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called"); -eval.counts.called.all = subset(eval.counts.called, novelty_name == "all")$nVariantLoci; -eval.counts.called.known = subset(eval.counts.called, novelty_name == "known")$nVariantLoci; -eval.counts.called.novel = subset(eval.counts.called, novelty_name == "novel")$nVariantLoci; + eval.titv = read.csv(paste(cmdargs$evalroot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""), header=TRUE, comment.char="#"); + eval.titv.called = subset(eval.titv, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called"); + eval.titv.called.all = subset(eval.titv.called, novelty_name == "all")$ti.tv_ratio; + eval.titv.called.known = subset(eval.titv.called, novelty_name == "known")$ti.tv_ratio; + eval.titv.called.novel = subset(eval.titv.called, novelty_name == "novel")$ti.tv_ratio; -eval.titv = read.csv(paste(cmdargs$evalroot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""), header=TRUE, comment.char="#"); -eval.titv.called = subset(eval.titv, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called"); -eval.titv.called.all = subset(eval.titv.called, novelty_name == "all")$ti.tv_ratio; -eval.titv.called.known = subset(eval.titv.called, novelty_name == "known")$ti.tv_ratio; -eval.titv.called.novel = subset(eval.titv.called, novelty_name == "novel")$ti.tv_ratio; + table4 = matrix(c(eval.counts.called.all, eval.counts.called.known, eval.counts.called.novel, eval.titv.called.all, eval.titv.called.known, eval.titv.called.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3); + print(nrow(table4)) -vars = matrix(c(eval.counts.called.all, eval.counts.called.known, eval.counts.called.novel, eval.titv.called.all, eval.titv.called.known, eval.titv.called.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3); -rownames(vars) = c("All", "Known", "Novel"); -colnames(vars) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio"); + rownames(table4) = c("All", "Known", "Novel"); + colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio"); -cat(sprintf("\t\tFound\tTi/Tv ratio\tExpected Ti/Tv ratio\n"), file=cmdargs$statsout, append=TRUE); -cat(sprintf("\tAll\t%s\t%s\t\t%s\n", eval.counts.called.all, eval.titv.called.all, "3.0 - 3.2"), file=cmdargs$statsout, append=TRUE); -cat(sprintf("\tKnown\t%s\t%s\t\t%s\n", eval.counts.called.known, eval.titv.called.known, "3.2 - 3.4"), file=cmdargs$statsout, append=TRUE); -cat(sprintf("\tNovel\t%s\t%s\t\t%s\n", eval.counts.called.novel, eval.titv.called.novel, "2.7 - 3.0"), file=cmdargs$statsout, append=TRUE); + + + textplot(table4, rmar=1, col.rownames="dark blue", cex=1.25) + title(main="Variant Summary", family="sans", cex.main=1.75) + + dev.off() + } + +tearsheet() # Plots - +plots<-function(){ eval.bysample = read.csv(paste(cmdargs$evalroot, ".SimpleMetricsBySample.csv", sep=""), header=TRUE, comment.char="#"); eval.bysample.called = subset(eval.bysample, evaluation_name == "eval" & comparison_name == "dbsnp" & jexl_expression == "none" & filter_name == "called"); eval.bysample.called.all = subset(eval.bysample.called, novelty_name == "all"); @@ -243,3 +290,6 @@ points(eval.func.called.novel$Synonymous[ind] / (eval.func.called.novel$Missense legend("topright", c("All", "Known", "Novel"), col=c("black", "blue", "red"), pt.cex=1.3, pch=21); dev.off(); +} + +plots()