Preliminary QC script in R, which checks coverage, fingerprints, library duplication, total SNPs, dbSNP%, and availability of sample data
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5881 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6ecbfa9013
commit
1e2892a35d
|
|
@ -0,0 +1,177 @@
|
|||
#preqc.r
|
||||
library(gplots)
|
||||
.libPaths('/humgen/gsa-pipeline/.repository/R/')
|
||||
library(gsalib)
|
||||
|
||||
cmdargs = gsa.getargs(
|
||||
list(
|
||||
tsv = list(value=NA, doc="pipeline tsv file"),
|
||||
qcout=list(value=NA, doc="path to output root")
|
||||
),
|
||||
doc="Creates a tearsheet"
|
||||
);
|
||||
|
||||
read.delim(cmdargs$tsv, header=FALSE)->settable
|
||||
|
||||
squids<-unique(settable[,1])
|
||||
print(paste(nrow(settable), "samples in tsv"))
|
||||
lane<-data.frame()
|
||||
samp<-data.frame()
|
||||
for(squid in squids){
|
||||
gsa.read.squidmetrics(squid, TRUE)->lanemetrics
|
||||
print(paste("Got lane metrics for", squid))
|
||||
addlanes<-lanemetrics[which(lanemetrics$"External ID" %in% settable[,2]),]
|
||||
gsa.read.squidmetrics(squid, FALSE)->samplemetrics
|
||||
print(paste("Got sample metrics for", squid))
|
||||
addsamps<-samplemetrics[which(samplemetrics$Sample %in% settable[,2]),]
|
||||
lane<-rbind(lane, addlanes)
|
||||
samp<-rbind(samp, addsamps)
|
||||
}
|
||||
|
||||
print(paste(nrow(samp), "samples in samp"))
|
||||
print(paste(length(unique(lane$"External ID")), "samples in lane"))
|
||||
|
||||
print(paste(setdiff(settable[,2], samp$Sample), "do not overlap between samp and tsv"))
|
||||
print(paste(setdiff(settable[,2], lane$"External ID"), "do not overlap between lane and tsv"))
|
||||
print(paste(setdiff(samp$Sample, lane$"External ID"), "do not overlap between lane and samp"))
|
||||
|
||||
missingSamp<-setdiff(settable[,2], samp$Sample)
|
||||
missingLane<-setdiff(settable[,2], lane$"External ID")
|
||||
|
||||
drv = dbDriver("Oracle");
|
||||
con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
|
||||
|
||||
rs = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS"));
|
||||
d = fetch(rs, n=-1);
|
||||
dbHasCompleted(rs);
|
||||
dbClearResult(rs);
|
||||
|
||||
rs2 = dbSendQuery(con, statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG"));
|
||||
d2 = fetch(rs2, n=-1);
|
||||
dbHasCompleted(rs2);
|
||||
dbClearResult(rs2);
|
||||
|
||||
oraCloseDriver(drv);
|
||||
|
||||
compsamp=d2[which(d2$"Bait Set" %in% samp$"Bait Set"),]
|
||||
complane=d[which(d$"Bait Set" %in% lane$"Bait Set"),]
|
||||
|
||||
|
||||
|
||||
pdf(paste(cmdargs$qcout, "pdf", sep="."), width=11, height=8.5)
|
||||
|
||||
plot(samp$"Target Bases 20x %", main="Coverage to 20x", ylab="% Targets Covered to 20x", xlab="Sample", ylim=c(0,100))
|
||||
abline(h=80, lty=2)
|
||||
legend("bottomright", lty=2, legend="80% coverage to 20x")
|
||||
lowcoverage<-samp$Sample[which(samp$"Target Bases 20x %"<80)]
|
||||
if(length(lowcoverage)>0){
|
||||
text(which(samp$"Target Bases 20x %"<80),samp$"Target Bases 20x %"[which(samp$"Target Bases 20x %"<80)], labels=samp$Sample[which(samp$"Target Bases 20x %"<80)], pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
plot(samp$"Zero Coverage Targets %", main="Zero Coverage", ylab="% Targets with zero coverage", log="y", xlab="Sample", ylim=c(0.01,100))
|
||||
abline(h=3, lty=2)
|
||||
legend("bottomright", lty=2, legend="3% Targets Zero Coverage")
|
||||
lowcoverage<-c(lowcoverage,samp$Sample[which(samp$"Zero Coverage">3)])
|
||||
if(length(which(samp$"Zero Coverage Targets %">3))>0){
|
||||
text(which(samp$"Zero Coverage Targets %">3), samp$"Zero Coverage Targets %"[which(samp$"Zero Coverage Targets %">3)], labels=samp$Sample[which(samp$"Zero Coverage Targets %">3)], pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
print("Coverage stats done")
|
||||
nofp<-lane$"External ID"[which(is.na(lane$"FP LOD"))]
|
||||
|
||||
if(length(which(is.na(lane$"FP LOD")))< nrow(lane)){
|
||||
|
||||
plot(lane$"FP Confident Calls"~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level # FP calls", main="Fingerprint Calls/Sample Instance", xaxt="n")
|
||||
medians<-tapply(lane$"FP Confident Calls",lane$"External ID", median, na.rm=TRUE)
|
||||
points(as.factor(dimnames(medians)[[1]]),medians,col="red", lwd=2)
|
||||
legend("topleft", legend="Median across sample instances", pch=1, lwd=2, col="red", lty=0)
|
||||
poorFPcov<-dimnames(medians)[[1]][which(medians<5 )]
|
||||
if(length(poorFPcov)>0){
|
||||
text(which(medians<5), medians[which(medians<5)],poorFPcov, pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
|
||||
print("1 fp plot")
|
||||
plot(100*(lane$"FP Confident Matching SNPs"/lane$"FP Confident Calls")~as.factor(lane$"External ID"), xlab="sample", ylab="Multiplex level % matching FP calls", main="% Confident calls matching for samples with low confident calls", xaxt="n", ylim=c(0,110))
|
||||
|
||||
print("2 fp plot")
|
||||
|
||||
plot(lane$"FP LOD"~as.factor(lane$"External ID"), xlab="sample", ylab="Sample Fingerprint LOD", main="Fingerprint Pass:Samples", xaxt="n")
|
||||
offsamps<-lane$"External ID"[which(lane$"FP LOD"<(-3))]
|
||||
lowfpLOD<-lane$"External ID"[which(lane$"FP LOD"<6)]
|
||||
|
||||
if(length(lowfpLOD)>0){
|
||||
text(which(lane$"External ID" %in% lowfpLOD), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="hotpink")
|
||||
}
|
||||
print("3 fp plot")
|
||||
|
||||
if(length(lowfpLOD)>0){
|
||||
plot((lane$"FP Confident Calls"-lane$"FP Confident Matching SNPs")~as.factor(lane$"External ID"), main="Calls vs Matching Calls for Samples failing FP QC", ylab="# Mismatches", xlab="")
|
||||
}
|
||||
if(length(lowfpLOD)>0){
|
||||
text(which(lane$"FP LOD"<6), lane$"FP_LOD"[which(lane$"FP LOD"<6)], labels=lowfpLOD, pos=2, srt=270, cex=.6, col="RED")
|
||||
}
|
||||
|
||||
|
||||
}else{
|
||||
offsamps<-"NO FPDATA"
|
||||
lowfpLOD<-"NO FP DATA"
|
||||
poorFPcov<-"NO FP DATA"
|
||||
}
|
||||
print("FP stats done")
|
||||
|
||||
boxplot(samp$"Total SNPs", compsamp$"Total SNPs", names=c("Current Set", "All Sets"), ylab="Total SNPs per sample", main="Total SNPs")
|
||||
standardQuants<-boxplot.stats(compsamp$"Total SNPs")$stats
|
||||
offSNPs<-samp$Sample[which(samp$"Total SNPs" <standardQuants[1])]
|
||||
offSNPs<-c(offSNPs, samp$Sample[which(samp$"Total SNPs" >standardQuants[5])])
|
||||
if(length(offSNPs >0)){
|
||||
text(1, samp$"Total SNPs"[which(samp$Sample %in% offSNPs)], labels=offSNPs, pos=2, col="hot pink")
|
||||
}
|
||||
print("SNP stats done")
|
||||
|
||||
boxplot(samp$"dbSNP %", compsamp$"dbSNP %", names=c("Current Set", "All Sets"), ylab="% SNPs in dbSNP per sample", main="dbSNP Percentage")
|
||||
standardQuants<-boxplot.stats(compsamp$"dbSNP %")$stats
|
||||
offdbSNP<-samp$Sample[which(samp$"dbSNP %" <standardQuants[1])]
|
||||
offdbSNP<-c(offdbSNP, samp$Sample[which(samp$"dbSNP %" >standardQuants[5])])
|
||||
if(length(offdbSNP >0)){
|
||||
text(1, samp$"dbSNP %"[which(samp$Sample %in% offdbSNP)], labels=offdbSNP, pos=2, col="hot pink")
|
||||
}
|
||||
print("DBSNP stats done")
|
||||
|
||||
sampDuplication<-sub(pattern="Catch-.*: ", "",samp$"Library Duplication %")
|
||||
sampDuplication<-as.numeric(sub("%", "", sampDuplication))
|
||||
compsampDuplication<-sub(pattern="Catch-.*: ", "",compsamp$"Library Duplication %")
|
||||
compsampDuplication<-as.numeric(sub("%", "", compsampDuplication))
|
||||
|
||||
boxplot(sampDuplication, compsampDuplication, names=c("Current Set", "All Sets"), ylab="% Duplication", main="Library Duplication")
|
||||
standardQuants<-boxplot.stats(compsampDuplication)$stats
|
||||
offDup<-samp$Sample[which(sampDuplication <standardQuants[1])]
|
||||
offDup<-c(offDup, samp$Sample[which(sampDuplication >standardQuants[5])])
|
||||
if(length(offDup >0)){
|
||||
text(1, sampDuplication[which(samp$Sample %in% offDup)], labels=offDup, pos=2, col="hot pink")
|
||||
}
|
||||
print("Duplication stats done")
|
||||
|
||||
allproblemsamples<-unique(c(lowcoverage, poorFPcov, offsamps, lowfpLOD, offSNPs, offdbSNP, offDup, missingLane, missingSamp))
|
||||
problemMat<-matrix(c(rep("PASS", length(allproblemsamples)*9)), nrow=length(allproblemsamples))
|
||||
rownames(problemMat)<-allproblemsamples
|
||||
colnames(problemMat)<-c("low coverage", "low fp cov", "Identity Fail", "low FP LOD", "weird SNP count", "weird dbSNP %", "Duplicated", "Missing lane data", "missing agg data")
|
||||
problemMat[which(rownames(problemMat) %in% lowcoverage),1]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% poorFPcov),2]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offsamps),2]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% lowfpLOD),4]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offSNPs),5]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offdbSNP),6]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% offDup),7]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% missingLane),8]<-"FAIL"
|
||||
problemMat[which(rownames(problemMat) %in% missingSamp),9]<-"FAIL"
|
||||
|
||||
textplot(problemMat, cex=.5)
|
||||
|
||||
write.table(problemMat, file=paste(cmdargs$qcout,"qc.table",sep="."), quote=FALSE, sep="\t")
|
||||
print("no fp")
|
||||
print(unique(nofp))
|
||||
|
||||
|
||||
|
||||
dev.off()
|
||||
print("All stats done")
|
||||
Loading…
Reference in New Issue