Performance optimization for gsa.read.gatkreport.R

-- instead of using y = rbind(x, y), which is O(n^2) in a loop when processing lines into a data structure in R, preallocate a matrix and explicitly assign each row to x.  This results in a radical performance improvement when reading large tables into R.  It's possible with this optimization to read in a 70MB table for variantQCReport.R with 200K lines for 800 samples.
This commit is contained in:
Mark DePristo 2012-03-22 18:46:37 -04:00
parent 7faa9938b1
commit 6c2290fb6e
1 changed files with 66 additions and 58 deletions

View File

@ -99,69 +99,77 @@ gsa.read.gatkreportv0 <- function(lines) {
# Load all GATKReport v1 tables from file
gsa.read.gatkreportv1 <- function(lines) {
#print("loading with optimized v1 reader")
nLines = length(lines)
tableEnv = new.env();
tableName = NA;
tableHeader = c();
tableRows = NULL;
version = "";
rowCount = 0
headerRowCount = -1;
finishTable <- function() {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv);
}
for (line in lines) {
tableEnv = new.env();
tableName = NA;
tableHeader = c();
tableRows = c();
version = "";
headerRowCount = -1;
for (line in lines) {
if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
version = "v1.0";
headerRowCount = 0;
}
if ( (headerRowCount %% 2 == 1) && (version == "v1.0") ) {
#print("Trying to start a table with line:");
#print(line);
#Get table header
headerFields = unlist(strsplit(line, ":"));
if (!is.na(tableName)) {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
}
tableName = headerFields[3];
tableHeader = c();
tableRows = c();
columnStarts = c();
}
if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) {
headerRowCount = headerRowCount+1;
#print("Header Row count is at:")
#print(headerRowCount);
} else if (!is.na(tableName)) {
if ( version == "v1.0") {
if (length(tableHeader) == 0) {
headerChars = unlist(strsplit(line, ""));
# Find the first position of non space characters, excluding the first character
columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
}
row = .gsa.splitFixedWidth(line, columnStarts);
}
if (length(tableHeader) == 0) {
tableHeader = row;
} else if ( nchar(line) > 0 ) {
tableRows = rbind(tableRows, row);
}
}
if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
version = "v1.0";
headerRowCount = 0;
}
if (!is.na(tableName)) {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
if ( (headerRowCount %% 2 == 1) && (version == "v1.0") ) {
#print("Trying to start a table with line:");
#print(line);
#Get table header
headerFields = unlist(strsplit(line, ":"));
if (!is.na(tableName)) {
finishTable()
}
tableName = headerFields[3];
tableHeader = c();
tableRows = NULL
rowCount = 0
columnStarts = c();
}
gatkreport = as.list(tableEnv, all.names=TRUE);
if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) {
headerRowCount = headerRowCount+1;
#print("Header Row count is at:")
#print(headerRowCount);
} else if (!is.na(tableName)) {
if ( version == "v1.0") {
if (length(tableHeader) == 0) {
headerChars = unlist(strsplit(line, ""));
# Find the first position of non space characters, excluding the first character
columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
tableRows = matrix(nrow=nLines, ncol=length(columnStarts)+1);
}
row = .gsa.splitFixedWidth(line, columnStarts);
}
if (length(tableHeader) == 0) {
tableHeader = row;
} else if ( nchar(line) > 0 ) {
rowCount = rowCount + 1
tableRows[rowCount,] <- row
}
}
}
if (!is.na(tableName)) {
finishTable()
}
gatkreport = as.list(tableEnv, all.names=TRUE);
}
# Load all GATKReport tables from a file