RR optimization: don't recalculate the entire bitset of variant sites for every read added to the sliding window. Instead, reuse as much of the previously calculated bitset as you can (basically from the window start until the start of the new read minus the context size). In some awfully performing regions this cuts down the runtime in half, although in others this doesn't seem to help much (so clearly something else is going on). Note that I still need to fix one last bug here, but it's almost done.
This commit is contained in:
parent
54f698422c
commit
27d8d3f51e
|
|
@ -24,7 +24,7 @@ import java.util.*;
|
||||||
public class SlidingWindow {
|
public class SlidingWindow {
|
||||||
|
|
||||||
// Sliding Window data
|
// Sliding Window data
|
||||||
final private LinkedList<GATKSAMRecord> readsInWindow;
|
final private TreeSet<GATKSAMRecord> readsInWindow;
|
||||||
final private LinkedList<HeaderElement> windowHeader;
|
final private LinkedList<HeaderElement> windowHeader;
|
||||||
protected int contextSize; // the largest context size (between mismatches and indels)
|
protected int contextSize; // the largest context size (between mismatches and indels)
|
||||||
protected String contig;
|
protected String contig;
|
||||||
|
|
@ -97,7 +97,13 @@ public class SlidingWindow {
|
||||||
this.MIN_MAPPING_QUALITY = minMappingQuality;
|
this.MIN_MAPPING_QUALITY = minMappingQuality;
|
||||||
|
|
||||||
this.windowHeader = new LinkedList<HeaderElement>();
|
this.windowHeader = new LinkedList<HeaderElement>();
|
||||||
this.readsInWindow = new LinkedList<GATKSAMRecord>();
|
this.readsInWindow = new TreeSet<GATKSAMRecord>(new Comparator<GATKSAMRecord>() {
|
||||||
|
@Override
|
||||||
|
public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
|
||||||
|
final int difference = read1.getSoftEnd() - read2.getSoftEnd();
|
||||||
|
return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
this.contig = contig;
|
this.contig = contig;
|
||||||
this.contigIndex = contigIndex;
|
this.contigIndex = contigIndex;
|
||||||
|
|
@ -195,55 +201,102 @@ public class SlidingWindow {
|
||||||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||||
*/
|
*/
|
||||||
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
|
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
|
||||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||||
|
|
||||||
if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) {
|
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader);
|
|
||||||
boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex);
|
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
|
||||||
|
markSites(incomingReadUnclippedStart);
|
||||||
|
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
|
||||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||||
|
|
||||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
|
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
|
||||||
finalizedReads = closeVariantRegions(regions, false);
|
finalizedReads = closeVariantRegions(regions, false);
|
||||||
|
|
||||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||||
final int windowHeaderStartLoc = getStartLocation(windowHeader);
|
readsInWindow.pollFirst();
|
||||||
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
|
||||||
if (read.getSoftEnd() < windowHeaderStartLoc) {
|
|
||||||
readsToRemove.add(read);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (GATKSAMRecord read : readsToRemove) {
|
|
||||||
readsInWindow.remove(read);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return finalizedReads;
|
return finalizedReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private final class MarkedSites {
|
||||||
|
|
||||||
|
private boolean[] siteIsVariant = new boolean[0];
|
||||||
|
private int startLocation = 0;
|
||||||
|
|
||||||
|
public MarkedSites() {}
|
||||||
|
|
||||||
|
public boolean[] getVariantSiteBitSet() { return siteIsVariant; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the variant site bitset given the new startlocation and size of the region to mark.
|
||||||
|
*
|
||||||
|
* @param newStartLocation the new start location of the bitset
|
||||||
|
* @param sizeOfRegion the new size of the region to be represented
|
||||||
|
*
|
||||||
|
* @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder.
|
||||||
|
*/
|
||||||
|
public int updateRegion(final int newStartLocation, final int sizeOfRegion) {
|
||||||
|
int lastPositionMarked = sizeOfRegion;
|
||||||
|
|
||||||
|
// if this is the first time we set the array and we can't reuse anything, just create a new array from scratch
|
||||||
|
if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) {
|
||||||
|
siteIsVariant = new boolean[sizeOfRegion];
|
||||||
|
lastPositionMarked = 0;
|
||||||
|
}
|
||||||
|
// if the dimensions change, copy what we can and continue
|
||||||
|
else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) {
|
||||||
|
final boolean[] tempArray = new boolean[sizeOfRegion];
|
||||||
|
final int differenceInStartPositions = newStartLocation - this.startLocation;
|
||||||
|
lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion);
|
||||||
|
System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked);
|
||||||
|
siteIsVariant = null; // explicitly allow garbage collection
|
||||||
|
siteIsVariant = tempArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.startLocation = newStartLocation;
|
||||||
|
|
||||||
|
return lastPositionMarked + newStartLocation;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final MarkedSites markedSites = new MarkedSites();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* returns an array marked with variant and non-variant regions (it uses
|
* returns an array marked with variant and non-variant regions (it uses
|
||||||
* markVariantRegions to make the marks)
|
* markVariantRegions to make the marks)
|
||||||
*
|
*
|
||||||
* @param stop check the window from start to stop (not-inclusive)
|
* @param stop check the window from start to stop (not-inclusive)
|
||||||
* @return a boolean array with 'true' marking variant regions and false marking consensus sites
|
|
||||||
*/
|
*/
|
||||||
protected boolean[] markSites(int stop) {
|
protected void markSites(final int stop) {
|
||||||
|
|
||||||
boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1];
|
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||||
|
final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1;
|
||||||
|
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion);
|
||||||
|
final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
|
||||||
|
|
||||||
|
// update the iterator to the correct position
|
||||||
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
|
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
|
||||||
for (int i = getStartLocation(windowHeader); i < stop; i++) {
|
for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
|
||||||
|
if (headerElementIterator.hasNext())
|
||||||
|
headerElementIterator.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
// process a contextSize worth of region from scratch in case there's a variant there
|
||||||
|
for (int i = locationToProcess; i < stop; i++) {
|
||||||
if (headerElementIterator.hasNext()) {
|
if (headerElementIterator.hasNext()) {
|
||||||
HeaderElement headerElement = headerElementIterator.next();
|
HeaderElement headerElement = headerElementIterator.next();
|
||||||
|
|
||||||
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||||
markVariantRegion(markedSites, i - getStartLocation(windowHeader));
|
markVariantRegion(markedSites, i - windowHeaderStartLocation);
|
||||||
|
|
||||||
} else
|
} else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return markedSites;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -252,11 +305,11 @@ public class SlidingWindow {
|
||||||
* @param markedSites the boolean array to bear the marks
|
* @param markedSites the boolean array to bear the marks
|
||||||
* @param variantSiteLocation the location where a variant site was found
|
* @param variantSiteLocation the location where a variant site was found
|
||||||
*/
|
*/
|
||||||
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
|
protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
|
||||||
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
||||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
|
int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
|
||||||
for (int i = from; i < to; i++)
|
for (int i = from; i < to; i++)
|
||||||
markedSites[i] = true;
|
markedSites.getVariantSiteBitSet()[i] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -625,8 +678,8 @@ public class SlidingWindow {
|
||||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||||
|
|
||||||
if (!windowHeader.isEmpty()) {
|
if (!windowHeader.isEmpty()) {
|
||||||
boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1);
|
markSites(getStopLocation(windowHeader) + 1);
|
||||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
|
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
|
||||||
finalizedReads = closeVariantRegions(regions, true);
|
finalizedReads = closeVariantRegions(regions, true);
|
||||||
|
|
||||||
if (!windowHeader.isEmpty()) {
|
if (!windowHeader.isEmpty()) {
|
||||||
|
|
@ -635,6 +688,7 @@ public class SlidingWindow {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return finalizedReads;
|
return finalizedReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue