Fix the way lanes are pulled from the database so that multiplexed lanes and older sequence data IDs are handled properly.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4ee4fd47e9
commit
4f2882c546
|
|
@ -25,19 +25,22 @@ cmdargs = gsa.getargs(
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
bamlist = read.table(cmdargs$bamlist);
|
bamlist = scan(cmdargs$bamlist, "character");
|
||||||
|
squids <- c()
|
||||||
fclanes = c();
|
fclanes = c();
|
||||||
for (bam in bamlist$V1) {
|
for (bam in bamlist) {
|
||||||
bamheader = system(paste("samtools view -H", bam), intern=TRUE);
|
bamheader = system(paste("samtools view -H", bam), intern=TRUE);
|
||||||
|
squids<-c(squids, strsplit(bam, "/")[[1]][4])
|
||||||
|
|
||||||
if (length(bamheader) > 0) {
|
if (length(bamheader) > 0) {
|
||||||
rgs = bamheader[grep("^@RG", bamheader)];
|
rgs = bamheader[grep("^@RG", bamheader)];
|
||||||
|
|
||||||
for (rg in rgs) {
|
for (rg in rgs) {
|
||||||
id = grep("ID:", unlist(strsplit(rg, "\t")), value=TRUE);
|
id = grep("PU:", unlist(strsplit(rg, "\t")), value=TRUE);
|
||||||
id = sub("ID:", "", id);
|
id = sub("PU:", "", id);
|
||||||
|
id = gsub("A.XX......", "", id)
|
||||||
|
##change this so that it pulls out the flowcell and lane properly from old samples
|
||||||
|
##change this so that it pulls out only samples which are in the right project for new samples
|
||||||
fclanes = c(fclanes, id);
|
fclanes = c(fclanes, id);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -63,8 +66,10 @@ oraCloseDriver(drv);
|
||||||
squid_fclanes = sprintf("%s.%s", d$"Flowcell", d$"Lane");
|
squid_fclanes = sprintf("%s.%s", d$"Flowcell", d$"Lane");
|
||||||
squid_fclanes = gsub("A.XX", "", squid_fclanes);
|
squid_fclanes = gsub("A.XX", "", squid_fclanes);
|
||||||
|
|
||||||
|
|
||||||
dproj = d[which(squid_fclanes %in% fclanes),];
|
dproj = d[which(squid_fclanes %in% fclanes),];
|
||||||
d2proj = d2[which(d2$"Project" %in% unique(dproj$"Project") & d2$"Sample" %in% dproj$"External ID"),];
|
dproj = dproj[which(dproj$"Project" %in% unique(squids)),]
|
||||||
|
d2proj = d2[which(d2$"Project" %in% unique(squids) & d2$"Sample" %in% dproj$"External ID"),];
|
||||||
|
|
||||||
|
|
||||||
tearsheet<-function(){
|
tearsheet<-function(){
|
||||||
|
|
@ -91,7 +96,7 @@ tearsheet<-function(){
|
||||||
# Project summary
|
# Project summary
|
||||||
projects = paste(unique(dproj$"Project"), collapse=", ");
|
projects = paste(unique(dproj$"Project"), collapse=", ");
|
||||||
|
|
||||||
used_samples = length(bamlist$V1);
|
used_samples = length(bamlist);
|
||||||
|
|
||||||
unused_samples = 0;
|
unused_samples = 0;
|
||||||
|
|
||||||
|
|
@ -107,7 +112,7 @@ tearsheet<-function(){
|
||||||
bait_design<-strsplit(bait_design, ".Homo")[[1]][1]
|
bait_design<-strsplit(bait_design, ".Homo")[[1]][1]
|
||||||
}
|
}
|
||||||
|
|
||||||
callable_target = paste(unique(dproj$"Target Territory"), collapse=", ");
|
callable_target = paste(na.omit(unique(dproj$"Target Territory")), collapse=", ");
|
||||||
|
|
||||||
table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
|
table1<-rbind(paste(used_samples," used samples/", unused_samples + used_samples," total samples", sep=""), sequencing_protocol, bait_design, callable_target)
|
||||||
print(nrow(table1))
|
print(nrow(table1))
|
||||||
|
|
@ -117,12 +122,12 @@ tearsheet<-function(){
|
||||||
title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)
|
title(main=sprintf("Project Summary (%s)\n", projects), family="sans", cex.main=1.25, line=-1)
|
||||||
# Bases summary
|
# Bases summary
|
||||||
|
|
||||||
reads_per_lane_mean = format(mean(dproj$"PF Reads (HS)"), 8, 3,1, scientific=TRUE);
|
reads_per_lane_mean = format(mean(dproj$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
reads_per_lane_sd = format(sd(dproj$"PF Reads (HS)"), 8, 3,1, scientific=TRUE);
|
reads_per_lane_sd = format(sd(dproj$"PF Reads (HS)", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
lanes<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)
|
lanes<-sprintf("%s +/- %s\n", reads_per_lane_mean, reads_per_lane_sd)
|
||||||
|
|
||||||
used_bases_per_lane_mean = format(mean(dproj$"PF HQ Aligned Q20 Bases"),8, 3,1, scientific=TRUE);
|
used_bases_per_lane_mean = format(mean(dproj$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
|
||||||
used_bases_per_lane_sd = format(sd(dproj$"PF HQ Aligned Q20 Bases"), 8, 3,1, scientific=TRUE);
|
used_bases_per_lane_sd = format(sd(dproj$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
lanes<-c(lanes, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));
|
lanes<-c(lanes, sprintf("%s +/- %s\n", used_bases_per_lane_mean, used_bases_per_lane_sd));
|
||||||
|
|
||||||
target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage"));
|
target_coverage_mean = mean(na.omit(dproj$"Mean Target Coverage"));
|
||||||
|
|
@ -142,12 +147,12 @@ tearsheet<-function(){
|
||||||
lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
|
lanes<-c(lanes,sprintf("%0.2f%% +/- %0.2f%%\n", pct_loci_gt_30x_mean, pct_loci_gt_30x_sd));
|
||||||
|
|
||||||
|
|
||||||
reads_per_sample_mean = format(mean(d2proj$"PF Reads"), 8, 3,1, scientific=TRUE);
|
reads_per_sample_mean = format(mean(d2proj$"PF Reads", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
reads_per_sample_sd = format(sd(d2proj$"PF Reads"), 8, 3,1, scientific=TRUE);
|
reads_per_sample_sd = format(sd(d2proj$"PF Reads",na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
samps<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);
|
samps<-sprintf("%s +/- %s\n", reads_per_sample_mean, reads_per_sample_sd);
|
||||||
|
|
||||||
used_bases_per_sample_mean = format(mean(d2proj$"PF HQ Aligned Q20 Bases"),8, 3,1, scientific=TRUE);
|
used_bases_per_sample_mean = format(mean(d2proj$"PF HQ Aligned Q20 Bases", na.rm=TRUE),8, 3,1, scientific=TRUE);
|
||||||
used_bases_per_sample_sd = format(sd(d2proj$"PF HQ Aligned Q20 Bases"), 8, 3,1, scientific=TRUE);
|
used_bases_per_sample_sd = format(sd(d2proj$"PF HQ Aligned Q20 Bases", na.rm=TRUE), 8, 3,1, scientific=TRUE);
|
||||||
samps<-c(samps, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd));
|
samps<-c(samps, sprintf("%s +/- %s\n", used_bases_per_sample_mean, used_bases_per_sample_sd));
|
||||||
|
|
||||||
target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage"));
|
target_coverage_mean = mean(na.omit(d2proj$"Mean Target Coverage"));
|
||||||
|
|
@ -195,8 +200,8 @@ tearsheet<-function(){
|
||||||
unused_lanes_by_analysis = 0;
|
unused_lanes_by_analysis = 0;
|
||||||
|
|
||||||
|
|
||||||
lanes_per_sample_mean = mean(table(dproj$"External ID"));
|
lanes_per_sample_mean = mean(table(dproj$"External ID"), na.rm=TRUE);
|
||||||
lanes_per_sample_sd = sd(table(dproj$"External ID"));
|
lanes_per_sample_sd = sd(table(dproj$"External ID"), na.rm=TRUE);
|
||||||
lanes_per_sample_median = median(table(dproj$"External ID"));
|
lanes_per_sample_median = median(table(dproj$"External ID"));
|
||||||
lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired"));
|
lanes_paired = nrow(subset(dproj, dproj$"Lane Type" == "Paired"));
|
||||||
lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed"));
|
lanes_widowed = nrow(subset(dproj, dproj$"Lane Type" == "Widowed"));
|
||||||
|
|
@ -249,7 +254,7 @@ tearsheet<-function(){
|
||||||
|
|
||||||
table4 = matrix(c(eval.counts.called.all, eval.counts.called.known, eval.counts.called.novel, eval.titv.called.all, eval.titv.called.known, eval.titv.called.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
|
table4 = matrix(c(eval.counts.called.all, eval.counts.called.known, eval.counts.called.novel, eval.titv.called.all, eval.titv.called.known, eval.titv.called.novel, "3.0 - 3.2", "3.2 - 3.4", "2.7 - 3.0"), nrow=3);
|
||||||
print(nrow(table4))
|
print(nrow(table4))
|
||||||
|
print(paste("columns should be three, actually are:", ncol(table4)))
|
||||||
rownames(table4) = c("All", "Known", "Novel");
|
rownames(table4) = c("All", "Known", "Novel");
|
||||||
colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");
|
colnames(table4) = c("Found", "Ti/Tv ratio", "Expected Ti/Tv ratio");
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue