NestedIntegerArray: Pre-allocate only the first two dimensions

It turns out that pre-allocating the entire tree was too expensive in
terms of memory when using large values for the -mcs and -ics parameters.

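Pre-allocating the first two dimensions prevents us from ever locking the
root node during a put(). Levels below the pre-allocated ones are now
created on demand inside put(), under a lock on their parent node.
Contention between threads over these lower levels of the tree should be
minimal, given that puts are rare compared to gets.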

Also log the array's dimensions and pre-allocation progress at startup. If
pre-allocation takes longer than usual, this gives the user a sense of what
is causing the delay.
This commit is contained in:
David Roazen 2012-10-25 13:30:49 -04:00
parent cc8c12b954
commit 884d031e72
1 changed files with 37 additions and 14 deletions

View File

@ -25,9 +25,11 @@
package org.broadinstitute.sting.utils.collections; package org.broadinstitute.sting.utils.collections;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
/** /**
@ -38,39 +40,50 @@ import java.util.List;
public class NestedIntegerArray<T> { public class NestedIntegerArray<T> {
private static Logger logger = Logger.getLogger(NestedIntegerArray.class);
protected final Object[] data; protected final Object[] data;
protected final int numDimensions; protected final int numDimensions;
protected final int[] dimensions; protected final int[] dimensions;
// Preallocate the first two dimensions to limit contention during tree traversals in put()
private static final int NUM_DIMENSIONS_TO_PREALLOCATE = 2;
public NestedIntegerArray(final int... dimensions) { public NestedIntegerArray(final int... dimensions) {
numDimensions = dimensions.length; numDimensions = dimensions.length;
if ( numDimensions == 0 ) if ( numDimensions == 0 )
throw new ReviewedStingException("There must be at least one dimension to an NestedIntegerArray"); throw new ReviewedStingException("There must be at least one dimension to an NestedIntegerArray");
this.dimensions = dimensions.clone(); this.dimensions = dimensions.clone();
int dimensionsToPreallocate = Math.min(dimensions.length, NUM_DIMENSIONS_TO_PREALLOCATE);
logger.info(String.format("Creating NestedIntegerArray with dimensions %s", Arrays.toString(dimensions)));
logger.info(String.format("Pre-allocating first %d dimensions", dimensionsToPreallocate));
data = new Object[dimensions[0]]; data = new Object[dimensions[0]];
prepopulateArray(data, 0); preallocateArray(data, 0, dimensionsToPreallocate);
logger.info(String.format("Done pre-allocating first %d dimensions", dimensionsToPreallocate));
} }
/** /**
* Recursively allocate the entire tree of arrays in all its dimensions. * Recursively allocate the first dimensionsToPreallocate dimensions of the tree
* *
* Doing this upfront uses more memory initially, but saves time over the course of the run * Pre-allocating the first few dimensions helps limit contention during tree traversals in put()
* and (crucially) avoids having to make threads wait while traversing the tree to check
* whether branches exist or not.
* *
* @param subarray current node in the tree * @param subarray current node in the tree
* @param dimension current level in the tree * @param dimension current level in the tree
* @param dimensionsToPreallocate preallocate only this many dimensions (starting from the first)
*/ */
private void prepopulateArray( Object[] subarray, int dimension ) { private void preallocateArray( Object[] subarray, int dimension, int dimensionsToPreallocate ) {
if ( dimension >= numDimensions - 1 ) { if ( dimension >= dimensionsToPreallocate - 1 ) {
return; return;
} }
for ( int i = 0; i < subarray.length; i++ ) { for ( int i = 0; i < subarray.length; i++ ) {
subarray[i] = new Object[dimensions[dimension + 1]]; subarray[i] = new Object[dimensions[dimension + 1]];
prepopulateArray((Object[])subarray[i], dimension + 1); preallocateArray((Object[])subarray[i], dimension + 1, dimensionsToPreallocate);
} }
} }
@ -82,8 +95,9 @@ public class NestedIntegerArray<T> {
if ( keys[i] >= dimensions[i] ) if ( keys[i] >= dimensions[i] )
return null; return null;
myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse myData = (Object[])myData[keys[i]];
// down to the leaves if ( myData == null )
return null;
} }
return (T)myData[keys[numNestedDimensions]]; return (T)myData[keys[numNestedDimensions]];
@ -92,8 +106,8 @@ public class NestedIntegerArray<T> {
/** /**
* Insert a value at the position specified by the given keys. * Insert a value at the position specified by the given keys.
* *
* This method is THREAD-SAFE despite not being synchronized, however the caller MUST * This method is thread-safe, however the caller MUST check the
* check the return value to see if the put succeeded. This method RETURNS FALSE if * return value to see if the put succeeded. This method RETURNS FALSE if
* the value could not be inserted because there already was a value present * the value could not be inserted because there already was a value present
* at the specified location. In this case the caller should do a get() to get * at the specified location. In this case the caller should do a get() to get
* the already-existing value and (potentially) update it. * the already-existing value and (potentially) update it.
@ -113,8 +127,17 @@ public class NestedIntegerArray<T> {
if ( keys[i] >= dimensions[i] ) if ( keys[i] >= dimensions[i] )
throw new ReviewedStingException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")"); throw new ReviewedStingException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")");
myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse // If we're at or beyond the last dimension that was pre-allocated, we need to do a synchronized
// down to the leaves // check to see if the next branch exists, and if it doesn't, create it
if ( i >= NUM_DIMENSIONS_TO_PREALLOCATE - 1 ) {
synchronized ( myData ) {
if ( myData[keys[i]] == null ) {
myData[keys[i]] = new Object[dimensions[i + 1]];
}
}
}
myData = (Object[])myData[keys[i]];
} }
synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it