From 1e2892a35dde01f30c69dfc83257e91366e7ac94 Mon Sep 17 00:00:00 2001 From: corin Date: Thu, 26 May 2011 19:57:00 +0000 Subject: [PATCH] Preliminary QC script in R, which checks coverage, fingerprints, library duplication, total SNPs, dbSNP%, and availability of sample data git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5881 348d0f76-0448-11de-a6fe-93d51630548a --- R/DataProcessingReport/prelimqc.r | 177 ++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 R/DataProcessingReport/prelimqc.r diff --git a/R/DataProcessingReport/prelimqc.r b/R/DataProcessingReport/prelimqc.r new file mode 100644 index 000000000..5c3fc2ecb --- /dev/null +++ b/R/DataProcessingReport/prelimqc.r @@ -0,0 +1,177 @@ +#preqc.r +library(gplots) +.libPaths('/humgen/gsa-pipeline/.repository/R/') +library(gsalib) + +cmdargs = gsa.getargs( + list( + tsv = list(value=NA, doc="pipeline tsv file"), + qcout=list(value=NA, doc="path to output root") + ), + doc="Creates a tearsheet" +); + +read.delim(cmdargs$tsv, header=FALSE)->settable + + squids<-unique(settable[,1]) +print(paste(nrow(settable), "samples in tsv")) +lane<-data.frame() +samp<-data.frame() +for(squid in squids){ + gsa.read.squidmetrics(squid, TRUE)->lanemetrics + print(paste("Got lane metrics for", squid)) + addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),] + gsa.read.squidmetrics(squid, FALSE)->samplemetrics + print(paste("Got sample metrics for", squid)) + addsamps<-samplemetrics[which(samplemetrics$Sample %in% settable[,2]),] + lane<-rbind(lane, addlanes) + samp<-rbind(samp, addsamps) +} + +print(paste(nrow(samp), "samples in samp")) +print(paste(length(unique(lane$"External ID")), "samples in lane")) + +print(paste(setdiff(settable[,2], samp$Sample), "do not overlap between samp and tsv")) +print(paste(setdiff(settable[,2], lane$"External ID"), "do not overlap between lane and tsv")) +print(paste(setdiff(samp$Sample, lane$"External ID"), "do not overlap between lane and samp")) + +missingSamp<-setdiff(settable[,2], samp$Sample) +missingLane<-setdiff(settable[,2], lane$"External ID") + +drv = dbDriver("Oracle"); +con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD"); + +rs = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS")); +d = fetch(rs, n=-1); +dbHasCompleted(rs); +dbClearResult(rs); + +rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG")); +d2 = fetch(rs2, n=-1); +dbHasCompleted(rs2); +dbClearResult(rs2); + +oraCloseDriver(drv); + + compsamp=d2[which(d2$"Bait Set" %in% samp$"Bait Set"),] + complane=d[which(d$"Bait Set" %in% lane$"Bait Set"),] + + + +pdf(paste(cmdargs$qcout, "pdf", sep="."), width=11, height=8.5) + +plot(samp$"Target Bases 20x %", main="Coverage to 20x", ylab="% Targets Covered to 20x", xlab="Sample", ylim=c(0,100)) +abline(h=80, lty=2) +legend("bottomright", lty=2, legend="80% coverage to 20x") +lowcoverage<-samp$Sample[which(samp$"Target Bases 20x %"<80)] +if(length(lowcoverage)>0){ +text(which(samp$"Target Bases 20x %"<80),samp$"Target Bases 20x %"[which(samp$"Target Bases 20x %"<80)], labels=samp$Sample[which(samp$"Target Bases 20x %"<80)], pos=2, srt=270, cex=.6, col="hotpink") +} + +plot(samp$"Zero Coverage Targets %", main="Zero Coverage", ylab="% Targets with zero coverage", log="y", xlab="Sample", ylim=c(0.01,100)) +abline(h=3, lty=2) +legend("bottomright", lty=2, legend="3% Targets Zero Coverage") +lowcoverage<-c(lowcoverage,samp$Sample[which(samp$"Zero Coverage">3)]) +if(length(which(samp$"Zero Coverage Targets %">3))>0){ +text(which(samp$"Zero Coverage Targets %">3), samp$"Zero Coverage Targets %"[which(samp$"Zero Coverage Targets %">3)], labels=samp$Sample[which(samp$"Zero Coverage Targets %">3)], pos=2, srt=270, cex=.6, col="hotpink") +} + +print("Coverage stats done") +nofp<-lane$"External ID"[which(is.na(lane$"FP LOD"))] + +if(length(which(is.na(lane$"FP LOD")))< nrow(lane)){ + +plot(lane$"FP Confident Calls"~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level # FP calls", main="Fingerprint Calls/Sample Instance", xaxt="n") +medians<-tapply(lane$"FP Confident Calls",lane$"External ID", median, na.rm=TRUE) +points(as.factor(dimnames(medians)[[1]]),medians,col="red", lwd=2) +legend("topleft", legend="Median across sample instances", pch=1, lwd=2, col="red", lty=0) +poorFPcov<-dimnames(medians)[[1]][which(medians<5 )] +if(length(poorFPcov)>0){ +text(which(medians<5), medians[which(medians<5)],poorFPcov, pos=2, srt=270, cex=.6, col="hotpink") +} + +print("1 fp plot") +plot(100*(lane$"FP Confident Matching SNPs"/lane$"FP Confident Calls")~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level % matching FP calls", main="% Confident calls matching for samples with low confident calls", xaxt="n", ylim=c(0,110)) + +print("2 fp plot") + +plot(lane$"FP LOD"~as.factor(lane$"External ID"), xlab="sample", ylab="Sample Fingerprint LOD", main="Fingerprint Pass:Samples", xaxt="n") +offsamps<-lane$"External ID"[which(lane$"FP LOD"<(-3))] +lowfpLOD<-lane$"External ID"[which(lane$"FP LOD"<6)] + +if(length(lowfpLOD)>0){ +text(which(lane$"External ID" %in% lowfpLOD), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="hotpink") +} +print("3 fp plot") + +if(length(lowfpLOD)>0){ +plot((lane$"FP Confident Calls"-lane$"FP Confident Matching SNPs")~as.factor(lane$"External ID"), main="Calls vs Matching Calls for Samples failing FP QC", ylab="# Mismatches", xlab="") +} +if(length(lowfpLOD)>0){ +text(which(lane$"FP LOD"<6), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="RED") +} + + +}else{ +offsamps<-"NO FPDATA" +lowfpLOD<-"NO FP DATA" +poorFPcov<-"NO FP DATA" +} +print("FP stats done") + +boxplot(samp$"Total SNPs", compsamp$"Total SNPs", names=c("Current Set", "All Sets"), ylab="Total SNPs per sample", main="Total SNPs") +standardQuants<-boxplot.stats(compsamp$"Total SNPs")$stats +offSNPs<-samp$Sample[which(samp$"Total SNPs" standardQuants[5])]) +if(length(offSNPs >0)){ + text(1, samp$"Total SNPs"[which(samp$Sample %in% offSNPs)], labels=offSNPs, pos=2, col="hot pink") +} +print("SNP stats done") + +boxplot(samp$"dbSNP %", compsamp$"dbSNP %", names=c("Current Set", "All Sets"), ylab="% SNPs in dbSNP per sample", main="dbSNP Percentage") +standardQuants<-boxplot.stats(compsamp$"dbSNP %")$stats +offdbSNP<-samp$Sample[which(samp$"dbSNP %" standardQuants[5])]) +if(length(offdbSNP >0)){ + text(1, samp$"dbSNP %"[which(samp$Sample %in% offdbSNP)], labels=offdbSNP, pos=2, col="hot pink") +} +print("DBSNP stats done") + +sampDuplication<-sub(pattern="Catch-.*: ", "",samp$"Library Duplication %") +sampDuplication<-as.numeric(sub("%", "", sampDuplication)) +compsampDuplication<-sub(pattern="Catch-.*: ", "",compsamp$"Library Duplication %") +compsampDuplication<-as.numeric(sub("%", "", compsampDuplication)) + +boxplot(sampDuplication, compsampDuplication, names=c("Current Set", "All Sets"), ylab="% Duplication", main="Library Duplication") +standardQuants<-boxplot.stats(compsampDuplication)$stats +offDup<-samp$Sample[which(sampDuplication standardQuants[5])]) +if(length(offDup >0)){ + text(1, sampDuplication[which(samp$Sample %in% offDup)], labels=offDup, pos=2, col="hot pink") +} +print("Duplication stats done") + +allproblemsamples<-unique(c(lowcoverage, poorFPcov, offsamps, lowfpLOD, offSNPs, offdbSNP, offDup, missingLane, missingSamp)) +problemMat<-matrix(c(rep("PASS", length(allproblemsamples)*9)), nrow=length(allproblemsamples)) +rownames(problemMat)<-allproblemsamples +colnames(problemMat)<-c("low coverage", "low fp cov", "Identity Fail", "low FP LOD", "weird SNP count", "weird dbSNP %", "Duplicated", "Missing lane data", "missing agg data") +problemMat[which(rownames(problemMat) %in% lowcoverage),1]<-"FAIL" +problemMat[which(rownames(problemMat) %in% poorFPcov),2]<-"FAIL" +problemMat[which(rownames(problemMat) %in% offsamps),2]<-"FAIL" +problemMat[which(rownames(problemMat) %in% lowfpLOD),4]<-"FAIL" +problemMat[which(rownames(problemMat) %in% offSNPs),5]<-"FAIL" +problemMat[which(rownames(problemMat) %in% offdbSNP),6]<-"FAIL" +problemMat[which(rownames(problemMat) %in% offDup),7]<-"FAIL" +problemMat[which(rownames(problemMat) %in% missingLane),8]<-"FAIL" +problemMat[which(rownames(problemMat) %in% missingSamp),9]<-"FAIL" + +textplot(problemMat, cex=.5) + +write.table(problemMat, file=paste(cmdargs$qcout,"qc.table",sep="."), quote=FALSE, sep="\t") +print("no fp") +print(unique(nofp)) + + + +dev.off() +print("All stats done")