library(ellipse); library(hexbin); library(rgl); args = commandArgs(TRUE); plotRoot = args[1]; if (is.na(plotRoot)) { plotRoot = "test"; } clusterFile = args[2]; if (is.na(clusterFile)) { clusterFile = "/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v7/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized"; } vcfTable = args[3]; if (is.na(vcfTable)) { vcfTable = "/home/radon01/kiran/scr1/projects/DataProcessingPaper/scratch/MarkBustedWEx.table"; } lociFile = args[4]; if (is.na(lociFile) | lociFile == "NA" ) { lociFile = "/home/radon01/kiran/scr1/projects/DataProcessingPaper/scratch/MarkBustedWEx.loci"; } maxVariants = args[5]; if (is.na(maxVariants)) { maxVariants = -1; } maxVariants = as.integer(maxVariants) getAnnIndex <- function(d, ann) { index = -1; for (i in c(1:length(names(d)))) { if (names(d)[i] == ann) { index = i; } } index; } getClusterAnnIndex <- function(c, ann) { index = -1; for (i in c(1:length(c[[1]]$anns))) { if (c[[1]]$anns[i] == ann) { index = i; } } index; } plotAnn <- function(d.known, d.novel, d.loci, ann, plotfile) { index = getAnnIndex(d.known, ann); k = hist(d.known[,index], breaks=100, plot=FALSE); n = hist(d.novel[,index], breaks=100, plot=FALSE); pdf(plotfile); plot(k$mids, k$density, type="b", col="blue", ylim=c(0, max(k$density)), lwd=2, xlab=ann, ylab="Density", bty="n"); points(n$mids, n$density, type="b", col="red", lwd=2); if (!is.na(d.loci)) { legend("topright", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=c(21, 21, 18)); } else { legend("topright", c("Known", "Novel"), col=c("blue", "red"), pch=21); } if (!is.na(d.loci)) { for (i in c(1:nrow(d.loci))) { points(d.loci[i, index], 0, col="yellow3", pch=18, cex=2.0); } } dev.off(); } read.clusters <- function(filename) { con = file(filename, "r", blocking = FALSE) lines = readLines(con) close(con); anns = c(); annIndex = 1; clusterIndex = 1; clusters = c(); conversions = c(); for (line in lines) { if (length(grep("ANNOTATION", line)) > 0) { linePieces = unlist(strsplit(line, ",")); anns = c(anns, linePieces[2]); conversions[[annIndex]] = list(ann = linePieces[2], offset = as.numeric(linePieces[3]), multiplier = as.numeric(linePieces[4])); annIndex = annIndex + 1; } else if (length(grep("CLUSTER", line)) > 0) { linePieces = unlist(strsplit(line, ",")); mixtureWeight = linePieces[2]; mu = linePieces[3:(3+length(anns)-1)]; cov = linePieces[(3+length(anns)):length(linePieces)]; clusters[[clusterIndex]] = list( anns = anns, conversions = conversions, mixtureWeight = as.numeric(mixtureWeight), means = as.numeric(mu), cov = matrix(cov, nrow=length(anns), ncol=length(anns)) ); clusterIndex = clusterIndex + 1; } } clusters; } plotClusters <- function(d.known, d.novel, d.loci, c, ann1, ann2, filename, maxVariants = -1) { index1 = getAnnIndex(d.known, ann1); index2 = getAnnIndex(d.known, ann2); cindex1 = getClusterAnnIndex(c, ann1); cindex2 = getClusterAnnIndex(c, ann2); mult1 = c[[1]]$conversions[[cindex1]]$multiplier; off1 = c[[1]]$conversions[[cindex1]]$offset; mult2 = c[[1]]$conversions[[cindex2]]$multiplier; off2 = c[[1]]$conversions[[cindex2]]$offset; xvalsForLims = c(-5, 5) yvalsForLims = c(-5, 5) #xvalsForLims = (d.known[,index1] - off1)/mult1 #yvalsForLims = (d.known[,index2] - off2)/mult2 xlims = c(min(xvalsForLims), 1.2*max(xvalsForLims)); ylims = c(min(yvalsForLims), max(yvalsForLims)); clusterColors = c("#A62103", "#F27405", "#F29F05", "#F2B705", "#F2CB05"); pdf(filename); par(mar=c(5, 6, 2, 5)); plot(0, 0, type="n", xaxt="n", yaxt="n", xlim=xlims, ylim=ylims, xlab=ann1, ylab=ann2, bty="n"); mv.known = if (maxVariants == -1 | maxVariants >= nrow(d.known)) { seq(1, nrow(d.known)) } else { as.integer(runif(maxVariants, 1, nrow(d.known)+1))} mv.novel = if (maxVariants == -1 | maxVariants >= nrow(d.novel)) { 1:nrow(d.novel) } else { as.integer(runif(maxVariants, 1, nrow(d.novel)+1)) } print(dim(mv.known)) print(maxVariants) points(((d.known[,index1] - off1)/mult1)[mv.known], ((d.known[,index2] - off2)/mult2)[mv.known], pch=19, cex=0.3, col="#0000FF33"); points(((d.novel[,index1] - off1)/mult1)[mv.novel], ((d.novel[,index2] - off2)/mult2)[mv.novel], pch=19, cex=0.3, col="#FF000033"); for (clusterIndex in c(1:length(c))) { mu = c(c[[clusterIndex]]$means[cindex1], c[[clusterIndex]]$means[cindex2]); cov = matrix(as.numeric( matrix( c( c[[clusterIndex]]$cov[cindex1,cindex1], c[[clusterIndex]]$cov[cindex2,cindex1], c[[clusterIndex]]$cov[cindex1,cindex2], c[[clusterIndex]]$cov[cindex2,cindex2] ), nrow=2, ncol=2 ) ), nrow=2, ncol=2 ); weight = c[[clusterIndex]]$mixtureWeight; if (weight <= 0.20) { color = clusterColors[1]; } else if (weight > 0.20 && weight <= 0.40) { color = clusterColors[2]; } else if (weight > 0.40 && weight <= 0.60) { color = clusterColors[3]; } else if (weight > 0.60 && weight <= 0.80) { color = clusterColors[4]; } else if (weight > 0.80) { color = clusterColors[5]; } lineweight = ifelse(weight > 0.50, 4, 3); points(mu[1], mu[2], pch=21, col=color, cex=0.5); points(ellipse(t(cov), centre=mu), type="l", lwd=lineweight, col=color); } axis(1, labels=as.integer(seq(from=min(d.novel[,index1]), to=max(d.novel[,index1]), by=(abs(min(d.novel[,index1])) + abs(max(d.novel[,index1])))/5)), at=seq(from=min((d.novel[,index1] - off1)/mult1), to=max((d.novel[,index1] - off1)/mult1), by=(abs(min((d.novel[,index1] - off1)/mult1)) + abs(max((d.novel[,index1] - off1)/mult1)))/5) ); axis(2, labels=as.integer(seq(from=min(d.novel[,index2]), to=max(d.novel[,index2]), by=(abs(min(d.novel[,index2])) + abs(max(d.novel[,index2])))/5)), at=seq(from=min((d.novel[,index2] - off2)/mult2), to=max((d.novel[,index2] - off2)/mult2), by=(abs(min((d.novel[,index2] - off2)/mult2)) + abs(max((d.novel[,index2] - off2)/mult2)))/5) ); if (!is.na(d.loci)) { legend("bottomleft", c("Known", "Novel", "Suspicious loci"), col=c("blue", "red", "yellow3"), pch=19); } else { legend("bottomleft", c("Known", "Novel"), col=c("blue", "red"), pch=19); } pieces = 100; scale = ((abs(ylims[1]) + abs(ylims[2]))/pieces); width = ((abs(xlims[1]) + abs(xlims[2]))/12); offset = ylims[1]; for (i in c(1:pieces)) { color = clusterColors[1]; if (i <= 20) { color = clusterColors[1]; } else if (i > 20 && i <= 40) { color = clusterColors[2]; } else if (i > 40 && i <= 60) { color = clusterColors[3]; } else if (i > 60 && i <= 80) { color = clusterColors[4]; } else if (i > 80) { color = clusterColors[5]; } polygon(x=xlims[2] + c(0, 0, width, width), y=offset + scale*c((i-1), (i), (i), (i-1)), col=color, border=color); } axis(4, labels=c(0.0, 0.20, 0.40, 0.60, 0.80, 1.0), at=c(ylims[1], ylims[1] + 20*scale, ylims[1] + 40*scale, ylims[1] + 60*scale, ylims[1] + 80*scale, ylims[2])); mtext("Mixture coefficient", 4, line=3); if (!is.na(d.loci)) { points((d.loci[,index1] - off1)/mult1, (d.loci[,index2] - off2)/mult2, pch=19, cex=0.8, col="yellow3"); } dev.off(); } l = c(); if (!is.na(lociFile)) { t = read.table(lociFile, header=TRUE); l = t$POS; } d = read.table(vcfTable, header=TRUE); c = read.clusters(clusterFile); d.known = d[which(d$DB == 1),]; d.novel = d[which(d$DB == 0),]; d.loci = NA; if (length(l) > 0) { d.loci = d[which(d$POS %in% l),]; } for (ann1 in c[[1]]$anns) { print(ann1) plotAnn(d.known, d.novel, d.loci, ann1, paste(plotRoot, ".anndist.", ann1, ".pdf", sep="")); for (ann2 in c[[1]]$anns) { if (ann1 != ann2) { print(paste("-- v ", ann2)) plotClusters(d.known, d.novel, d.loci, c, ann1, ann2, paste(plotRoot, ".cluster.", ann1, "_vs_", ann2, ".pdf", sep=""), maxVariants=maxVariants); } } }