Performance optimization for gsa.read.gatkreport.R
-- Instead of growing the table with tableRows = rbind(tableRows, row) inside the loop that processes lines into a data structure in R, which is O(n^2) because every rbind call copies all of the rows accumulated so far, preallocate a matrix and explicitly assign each parsed row into it by index. This results in a radical performance improvement when reading large tables into R. With this optimization it is possible to read in a 70MB table for variantQCReport.R with 200K lines for 800 samples.
This commit is contained in:
parent
7faa9938b1
commit
6c2290fb6e
|
|
@ -99,15 +99,21 @@ gsa.read.gatkreportv0 <- function(lines) {
|
||||||
|
|
||||||
# Load all GATKReport v1 tables from file
|
# Load all GATKReport v1 tables from file
|
||||||
gsa.read.gatkreportv1 <- function(lines) {
|
gsa.read.gatkreportv1 <- function(lines) {
|
||||||
|
#print("loading with optimized v1 reader")
|
||||||
|
nLines = length(lines)
|
||||||
tableEnv = new.env();
|
tableEnv = new.env();
|
||||||
|
|
||||||
tableName = NA;
|
tableName = NA;
|
||||||
tableHeader = c();
|
tableHeader = c();
|
||||||
tableRows = c();
|
tableRows = NULL;
|
||||||
version = "";
|
version = "";
|
||||||
|
rowCount = 0
|
||||||
headerRowCount = -1;
|
headerRowCount = -1;
|
||||||
|
|
||||||
|
finishTable <- function() {
|
||||||
|
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv);
|
||||||
|
}
|
||||||
|
|
||||||
for (line in lines) {
|
for (line in lines) {
|
||||||
|
|
||||||
if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
|
if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
|
||||||
|
|
@ -123,15 +129,15 @@ gsa.read.gatkreportv1 <- function(lines) {
|
||||||
headerFields = unlist(strsplit(line, ":"));
|
headerFields = unlist(strsplit(line, ":"));
|
||||||
|
|
||||||
if (!is.na(tableName)) {
|
if (!is.na(tableName)) {
|
||||||
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
|
finishTable()
|
||||||
}
|
}
|
||||||
|
|
||||||
tableName = headerFields[3];
|
tableName = headerFields[3];
|
||||||
tableHeader = c();
|
tableHeader = c();
|
||||||
tableRows = c();
|
tableRows = NULL
|
||||||
|
rowCount = 0
|
||||||
|
|
||||||
columnStarts = c();
|
columnStarts = c();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) {
|
if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) {
|
||||||
|
|
@ -144,6 +150,7 @@ gsa.read.gatkreportv1 <- function(lines) {
|
||||||
headerChars = unlist(strsplit(line, ""));
|
headerChars = unlist(strsplit(line, ""));
|
||||||
# Find the first position of non space characters, excluding the first character
|
# Find the first position of non space characters, excluding the first character
|
||||||
columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
|
columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
|
||||||
|
tableRows = matrix(nrow=nLines, ncol=length(columnStarts)+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
row = .gsa.splitFixedWidth(line, columnStarts);
|
row = .gsa.splitFixedWidth(line, columnStarts);
|
||||||
|
|
@ -152,13 +159,14 @@ gsa.read.gatkreportv1 <- function(lines) {
|
||||||
if (length(tableHeader) == 0) {
|
if (length(tableHeader) == 0) {
|
||||||
tableHeader = row;
|
tableHeader = row;
|
||||||
} else if ( nchar(line) > 0 ) {
|
} else if ( nchar(line) > 0 ) {
|
||||||
tableRows = rbind(tableRows, row);
|
rowCount = rowCount + 1
|
||||||
|
tableRows[rowCount,] <- row
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is.na(tableName)) {
|
if (!is.na(tableName)) {
|
||||||
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
|
finishTable()
|
||||||
}
|
}
|
||||||
|
|
||||||
gatkreport = as.list(tableEnv, all.names=TRUE);
|
gatkreport = as.list(tableEnv, all.names=TRUE);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue