This script accpets file paths to analysis metrics tables and produces tearsheet data and data processing report graphs

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3585 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
corin 2010-06-18 13:02:25 +00:00
parent b978d5946b
commit a2c266bda3
1 changed files with 212 additions and 145 deletions

View File

@ -1,10 +1,26 @@
#Before executing this file, save squid files as csv, then as tab deliminated files with only the column values as the header, change the format of all cells to numbers. Assign the path to these files to "samples" and "lanes" respectively. #Before executing this file, save squid files as csv, then as tab deliminated files with only the column values as the header, change the format of all cells to numbers. Assign the path to these files to "samples" and "lanes" respectively.
tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4, eval5, eval6){ args<-commandArgs(TRUE)
lanes<-args[1]
samples<-args[2]
sample_sets<-args[3]
eval<-args[4]
noveltitv<-args[5]
knowntitv<-args[6]
DOC<-args[7]
if(is.na(sample_sets)){
print("Please specify sample set for file naming and press enter.")
scan("stdin", what="character",n=1)->sample_sets
print("Thanks!")
}
if(is.na(lanes) == FALSE && is.na(samples)==FALSE){
#this makes a table & graphs using Picard data
read.delim(file=lanes, header= TRUE)->bylane; read.delim(file=lanes, header= TRUE)->bylane;
read.delim(file=samples, header= TRUE)->bysample; read.delim(file=samples, header= TRUE)->bysample;
#Calc by lane metrics
attach(bylane); attach(bylane);
callable.target<-HS_TARGET_TERRITORY[1]; callable.target<-HS_TARGET_TERRITORY[1];
singlelanes<-length(which(Lane.Type=="Single")); singlelanes<-length(which(Lane.Type=="Single"));
@ -22,13 +38,12 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
sd.20x.lane<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE)); sd.20x.lane<-round(sd(HS_PCT_TARGET_BASES_20X, na.rm=TRUE));
sd.30x.lane<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE)); sd.30x.lane<-round(sd(HS_PCT_TARGET_BASES_30X, na.rm=TRUE));
names<-paste(Project, " ", External.ID, "-", Lane, sep="") names<-paste(Project, " ", External.ID, "-", Lane, sep="")
#makes a plot of the number of SNPS called per lane #makes a plot of the number of SNPS called per lane
library(graphics) library(graphics)
pdf(file=paste(sample_sets, "_SNPS.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS)) pdf(file=paste(sample_sets, "_SNPS.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS))
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE) layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
@ -65,11 +80,9 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4) mtext(paste("Outlier SNP call counts in ", length(boxplot.stats(SNP_TOTAL_SNPS)$out), "lanes"), side=1, line=4)
} }
dev.off() dev.off()
#makes a plot of snp calls ordered by lane #makes a plot of snp calls ordered by lane
pdf(file=paste(sample_sets, "_SNPS_lane.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS)) pdf(file=paste(sample_sets, "_SNPS_lane.pdf", sep=""), width=0.2*length(SNP_TOTAL_SNPS), height=0.1*length(SNP_TOTAL_SNPS))
layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE) layout(matrix(c(1,1 , 2), 1, 3, byrow=FALSE), respect=TRUE)
@ -89,8 +102,6 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
dev.off() dev.off()
#makes a plot of fingerprint calls and labels them good or bad #makes a plot of fingerprint calls and labels them good or bad
badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_MATCHING_SNPS<15)) badsnps<-union(which(FP_CONFIDENT_MATCHING_SNPS<15), which(FP_CONFIDENT_MATCHING_SNPS<15))
colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS))) colors<-c(rep("Blue", length(FP_CONFIDENT_CALLS)))
@ -106,14 +117,15 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes"), pch=c(1, 16, 1, 16), col=c("Blue", "Blue", "Red", "Red")) legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane", "Confident calls in bad lanes", "Confident matching calls in bad lanes"), pch=c(1, 16, 1, 16), col=c("Blue", "Blue", "Red", "Red"))
mtext("Some problematic fingerprint sites", side=3) mtext("Some problematic fingerprint sites", side=3)
}else{ }else{
legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane"), pch=c(1, 16), col="Blue")} legend("bottomright", legend=c("Confident calls at fingerprint sites by lane", "Confident matching calls at fingerprint sites by lane"), pch=c(1, 16), col="Blue")
}
dev.off() dev.off()
detach(bylane); detach(bylane)
#Calc by sample metrics
attach(bysample); attach(bysample);
mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE)); mean.lanes.samp<-signif(mean(X..Lanes.included.in.aggregation, na.rm = TRUE));
sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE)); sd.lanes.samp<-signif(sd(X..Lanes.included.in.aggregation, na.rm=TRUE));
mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE)); mean.mrl.samp<-signif(mean(Mean.Read.Length, na.rm=TRUE));
@ -134,7 +146,6 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
detach(bysample); detach(bysample);
#print all of this stuff out in R. #print all of this stuff out in R.
print(paste("Callable Target: ", callable.target, " bases", sep=""), quote = FALSE); print(paste("Callable Target: ", callable.target, " bases", sep=""), quote = FALSE);
print(paste("Used Lanes per Sample: ", mean.lanes.samp, " +/- ", sd.lanes.samp, sep=""), quote=FALSE); print(paste("Used Lanes per Sample: ", mean.lanes.samp, " +/- ", sd.lanes.samp, sep=""), quote=FALSE);
print(paste("Parities: ", singlelanes, " single lanes, ", pairedlanes, " paired lanes", sep=""), quote=FALSE); print(paste("Parities: ", singlelanes, " single lanes, ", pairedlanes, " paired lanes", sep=""), quote=FALSE);
@ -152,27 +163,83 @@ tearsheetcalc<-function(lanes, samples, sample_sets, eval1, eval2, eval3, eval4,
print(paste("% loci covered to 30x per lane: ", mean.30x.lane, "% +/- ", sd.30x.lane, "%", sep=""), quote = FALSE) print(paste("% loci covered to 30x per lane: ", mean.30x.lane, "% +/- ", sd.30x.lane, "%", sep=""), quote = FALSE)
print(paste("% loci covered to 30x per sample: ", mean.30x.samp, "% +/- ", sd.30x.samp, "%", sep=""), quote = FALSE) print(paste("% loci covered to 30x per sample: ", mean.30x.samp, "% +/- ", sd.30x.samp, "%", sep=""), quote = FALSE)
#still need to figure out how to get various information from eval file into R, but whatever }else{
print("Lane and Sample metrics file paths not provided")
read.csv(eval1, header=TRUE)->errpercycle }
errpercycle<-matrix(c(rep("tester", 24*90 ), rep(1:8, 3*90), rep(1:90,24), runif(90*24, min=0, max=0.7)), nrow=90*24, ncol=4); colnames(errpercycle)<-c("sample_set", "plate", "lane", "cycle", "errorrate") #delete this after testing
as.data.frame(errpercycle)->errpercycle
attach(errpercycle)
pdf(paste(sample_set, "error_rate_per_cycle.pdf", sep="")) #Makes Error Rate percycle graph
names<-paste(sample) if(is.na(eval)==FALSE){
read.delim(eval, header=TRUE)[2:ncol(read.delim(eval, header=TRUE))]->errpercycle
pdf(paste(sample_sets, "_errorrate_per_cycle.pdf", sep=""), width=6, height=5)
crazies<-which(errpercycle[75,]>0.3) #this can be changed to any kind of filter for particular lanes
colors<-rainbow(ncol(errpercycle), s=0.5, v=0.5)
colors[crazies]<-rainbow(length(crazies))
weights<-rep(1, ncol(errpercycle))
weights[crazies]<-2
matplot(errpercycle, type="l", lty="solid", col=colors, lwd=weights, main="Error Rate per Cycle", ylab="Error Rate", xlab="Cycle", ylim=c(0, 0.7))
if(length(crazies)>0){
legend("topleft", title="Unusual Lanes", legend=colnames(errpercycle)[crazies], lty="solid", lwd=2, col=colors[crazies], xjust=0.5)
}else{
legend("topleft", legend="No unusual lanes.", bty="n")
}
dev.off()
}else{
print("Error Rate Per Cycle file paths not provided")
}
#Makes TI/TV known v novel graph
if(is.na(noveltitv)==FALSE && is.na(knowntitv) == FALSE){
pdf(paste(sample_set, "_TiTv.pdf", sep=""), width=6, height=5)
read.table(file=noveltitv, header=FALSE)->novels
read.table(file=knowntitv, header=FALSE)->knowns
plot(novels[,2], col="red", ylim=c(0, 3.5), main="Ti/Tv for Novel and Known SNP calls", ylab="Ti/Tv", xlab="", xaxt="n")
points(knowns[,2], col="blue")
axis(side=1, at=(1:length(novels[,2])), labels=novels[,1], cex.axis=1, las=2)
legend("bottomright", legend=c("Known Variants", "Novel Variants"), col=c("blue", "red"), pch=1, xjust=0.5)
mtext("Lower Ti/Tv ratios indicated more false positive SNP calls.", side=1)
dev.off()
}else{
print("Transition/transversion ratio file paths not provided")
}
#Make DOC graph
if(is.na(DOC)==FALSE){
pdf(paste(sample_set, "_DOC.pdf", sep=""), width=6, height=5)
as.matrix(as.vector(read.delim(DOC, header=TRUE)[,2:502]))->DOCdata
DOCdata<-matrix(DOCdata*100/sum(DOCdata[1,]), nrow=501, ncol=29, byrow=TRUE)
colnames(DOCdata)<-read.delim(DOC, header=TRUE)[,1]
oddies<-which(apply(DOCdata, 2, max)>10) #can be assigned any particular heuristic
ncolors<-rainbow(ncol(DOCdata), s=0.5, v=0.5)
ncolors[oddies]<-rainbow(length(oddies))
nweights<-rep(1, ncol(DOCdata))
nweights[oddies]<-2
matplot(DOCdata, type="l", main="Depth of Coverage by Sample", ylab="Percent bases covered to a given depth", xlab="log(Depth)", log="x", col=ncolors, lty="solid", lwd=nweights)
if(length(oddies)>0){
legend("topright", title="Unusual Cases", legend=colnames(DOCdata)[oddies], lty="solid", lwd=2, col=ncolors[oddies], xjust=0.5)
}else{
legend("topright", legend="No unusual cases.", bty="n")
}
dev.off()
}else{
print("Depth of Coverage filepath not provided")
}
}
lane<-"mennonite_by_lane.txt"
#insert file path here before running
sample<-"mennonite_by_sample.txt"
#insertfilepath here before running
sample_set<-"Ashuldin_mennonite"
tearsheetcalc(lane,sample,sample_set)