A checkpoint commit of two BAM reading projects going on simultaneously. These two projects

are works in progress, and this checkin will provide a baseline against which to gauge 
improvements to both projects.

Low-memory BAM protoshards (disabled by default):
- Currently passing ValidatingPileupIntegrationTest.
- Gets progressively slower throughout the traversal, but should run at least as fast as original implementation.
- Uses 10+ file handles per BAM, but should use 3.

BAM performance microbenchmark test system:
- Currently tests performance of BAM reading using SAM-JDK vs. GATK
- Tests do not hit all GATK performance hotspots.
- New tests that require input data in a slightly different form are hard to implement.
- Output of test results is not easily parseable (investigating Google Caliper for possible improvements).



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5317 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2011-02-25 17:50:32 +00:00
parent ad1e4f47b1
commit 600f73cbd6
24 changed files with 1678 additions and 52 deletions

View File

@ -25,6 +25,7 @@
package net.sf.samtools;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
@ -53,8 +54,8 @@ public class GATKBAMFileSpan extends BAMFileSpan {
* Create a new chunk list from the given list of chunks.
* @param chunks Constituent chunks.
*/
public GATKBAMFileSpan(final List<GATKChunk> chunks) {
super(new ArrayList<Chunk>(chunks));
public GATKBAMFileSpan(final GATKChunk[] chunks) {
super(Arrays.<Chunk>asList(chunks));
}
/**

View File

@ -33,38 +33,102 @@ import java.util.List;
* override GATKBin and make it public.
* TODO: Eliminate once we determine the final fate of the BAM index reading code.
*/
public class GATKBin extends Bin {
public class GATKBin implements Comparable<GATKBin> {
/**
* The reference sequence associated with this bin.
*/
private final int referenceSequence;
/**
* The number of this bin within the BAM file.
*/
private final int binNumber;
/**
* The chunks associated with this bin.
*/
private GATKChunk[] chunkList;
public GATKBin(Bin bin) {
this(bin.getReferenceSequence(),bin.getBinNumber());
}
public GATKBin(final int referenceSequence, final int binNumber) {
super(referenceSequence,binNumber);
this.referenceSequence = referenceSequence;
this.binNumber = binNumber;
}
public GATKBin(final Bin bin) {
super(bin.getReferenceSequence(),bin.getBinNumber());
}
@Override
public int getReferenceSequence() {
return super.getReferenceSequence();
return referenceSequence;
}
@Override
public int getBinNumber() {
return super.getBinNumber();
return binNumber;
}
public List<GATKChunk> getGATKChunkList() {
List<GATKChunk> gatkChunks = new ArrayList<GATKChunk>();
for(Chunk chunk: getChunkList())
gatkChunks.add(new GATKChunk(chunk));
return gatkChunks;
}
public void setGATKChunkList(List<GATKChunk> chunks) {
super.setChunkList(new ArrayList<Chunk>(chunks));
/**
* Convert this GATKBin to a normal bin, for processing with the standard BAM query interface.
* @return
*/
public Bin toBin() {
return new Bin(referenceSequence,binNumber);
}
/**
* See whether two bins are equal. If the ref seq and the bin number
* are equal, assume equality of the chunk list.
* @param other The other Bin to which to compare this.
* @return True if the two bins are equal. False otherwise.
*/
@Override
public String toString() {
return String.format("Bin %d in contig %d",getBinNumber(),getReferenceSequence());
public boolean equals(Object other) {
if(other == null) return false;
if(!(other instanceof GATKBin)) return false;
GATKBin otherBin = (GATKBin)other;
return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber;
}
/**
* Compute a unique hash code for the given reference sequence and bin number.
* @return A unique hash code.
*/
@Override
public int hashCode() {
return ((Integer)referenceSequence).hashCode() ^ ((Integer)binNumber).hashCode();
}
/**
* Compare two bins to see what ordering they should appear in.
* @param other Other bin to which this bin should be compared.
* @return -1 if this < other, 0 if this == other, 1 if this > other.
*/
public int compareTo(GATKBin other) {
if(other == null)
throw new ClassCastException("Cannot compare to a null object");
// Check the reference sequences first.
if(this.referenceSequence != other.referenceSequence)
return referenceSequence - other.referenceSequence;
// Then check the bin ordering.
return binNumber - other.binNumber;
}
/**
* Sets the chunks associated with this bin
*/
public void setChunkList(GATKChunk[] list){
chunkList = list;
}
/**
* Gets the list of chunks associated with this bin.
* @return the chunks in this bin. If no chunks are associated, an empty list will be returned.
*/
public GATKChunk[] getChunkList(){
if(chunkList == null)
return new GATKChunk[0];
return chunkList;
}
}

View File

@ -179,7 +179,7 @@ public class GATKArgumentCollection {
* method.
* @return The default downsampling mechanism, or null if none exists.
*/
public DownsamplingMethod getDefaultDownsamplingMethod() {
public static DownsamplingMethod getDefaultDownsamplingMethod() {
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
}

View File

@ -0,0 +1,266 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Creates an 'index of the index' for a particular reference sequence
* within the BAM file, for easier whole-BAM-file traversal.
* file.
*/
public class BAMIndexBinIterator {
/**
* The source of index data.
*/
private final GATKBAMIndex index;
/**
* The file storing the index data.
*/
private final File indexFile;
/**
* File storing index metadata.
*/
private final File metaIndexFile;
/**
* Reference sequence that temporary file is based on.
*/
private final int referenceSequence;
/**
* Size of a long in bytes.
*/
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
this.index = index;
this.indexFile = indexFile;
this.referenceSequence = referenceSequence;
index.seek(4);
final int sequenceCount = index.readInteger();
if (referenceSequence >= sequenceCount)
throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount));
index.skipToSequence(referenceSequence);
int binCount = index.readInteger();
try {
metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null);
metaIndexFile.deleteOnExit();
FileOutputStream metaIndex = new FileOutputStream(metaIndexFile);
FileChannel metaIndexChannel = metaIndex.getChannel();
// zero out the contents of the file. Arrays of primitives in java are always zeroed out by default.
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*(Long.SIZE/8)]; // byte array is zeroed out by default.
metaIndexChannel.write(ByteBuffer.wrap(emptyContents));
ByteBuffer binPositionBuffer = ByteBuffer.allocate(emptyContents.length);
binPositionBuffer.order(ByteOrder.LITTLE_ENDIAN);
for (int binNumber = 0; binNumber < binCount; binNumber++) {
long position = index.position();
final int indexBin = index.readInteger();
metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES);
binPositionBuffer.putLong(position);
binPositionBuffer.flip();
System.out.printf("Writing bin number %d to position %d: coordinate = %d%n",indexBin,indexBin*Long.SIZE*8,position);
metaIndexChannel.write(binPositionBuffer);
binPositionBuffer.flip();
final int nChunks = index.readInteger();
index.skipBytes(16 * nChunks);
}
metaIndexChannel.close();
metaIndex.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to write BAM metaindex",ex);
}
}
public void close() {
metaIndexFile.delete();
}
public CloseableIterator<GATKBin> getIteratorOverLevel(int level) {
return new LevelIterator(level);
}
private class LevelIterator implements CloseableIterator<GATKBin> {
/**
* The raw BAM index file with unordered bins.
*/
private final FileInputStream indexInputStream;
/**
* The index metafile, with pointers to ordered bins.
*/
private final FileInputStream metaIndexInputStream;
/**
* The first and last bins in the level.
*/
private final int firstBinNumber, lastBinNumber;
/**
* The current bin in the index.
*/
private int currentBinNumber;
/**
* Position of the most recent chunk data.
*/
private GATKBin nextBin = null;
public LevelIterator(final int level) {
try {
indexInputStream = new FileInputStream(indexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index file for reading");
}
try {
metaIndexInputStream = new FileInputStream(metaIndexFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to open index metafile for reading");
}
firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level);
lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1;
currentBinNumber = firstBinNumber - 1;
advance();
}
public void close() {
try {
indexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index file.");
}
try {
metaIndexInputStream.close();
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index metafile");
}
}
public boolean hasNext() {
return nextBin != null;
}
public GATKBin next() {
if(!hasNext())
throw new NoSuchElementException("Out of elements in BAMIndexBinIterator");
GATKBin currentPosition = nextBin;
advance();
return currentPosition;
}
public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); }
private void advance() {
ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2);
indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN);
nextBin = null;
try {
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES);
while(nextBin == null && currentBinNumber < lastBinNumber) {
currentBinNumber++;
metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES);
metaIndexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
long currentPosition = indexFilePositionDecoder.getLong();
indexFilePositionDecoder.flip();
if(currentPosition != 0) {
indexInputStream.getChannel().position(currentPosition);
indexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
int binNumber = indexFilePositionDecoder.getInt();
if(binNumber != currentBinNumber)
throw new ReviewedStingException("Index file and metaindex file are out of sync.");
int nChunks = indexFilePositionDecoder.getInt();
GATKChunk[] chunks = new GATKChunk[nChunks];
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2);
indexFilePositionDecoder.clear();
for (int ci = 0; ci < nChunks; ci++) {
indexInputStream.getChannel().read(indexFilePositionDecoder);
indexFilePositionDecoder.flip();
final long chunkBegin = indexFilePositionDecoder.getLong();
final long chunkEnd = indexFilePositionDecoder.getLong();
chunks[ci] = new GATKChunk(chunkBegin, chunkEnd);
indexFilePositionDecoder.flip();
}
nextBin = new GATKBin(referenceSequence,binNumber);
nextBin.setChunkList(chunks);
}
}
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to close index metafile");
}
}
}
}

View File

@ -100,7 +100,7 @@ class BAMIndexContent {
List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
for (GATKBin b : mBinList)
if (b.getChunkList() != null) {
allChunks.addAll(b.getGATKChunkList());
allChunks.addAll(Arrays.asList(b.getChunkList()));
}
return Collections.unmodifiableList(allChunks);
}

View File

@ -0,0 +1,198 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Bin;
import net.sf.samtools.GATKBin;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Represents a tree of overlapping bins in a single
* BAM index.
*/
public class BinTree {
/**
* The BAM index from which this bin data is sourced.
*/
private final GATKBAMIndex index;
/**
* The bins in this tree, organized by level.
*/
private final GATKBin[] bins;
public BinTree(GATKBAMIndex index,final GATKBin[] bins) {
this.index = index;
this.bins = bins;
}
public GATKBin getLowestLevelBin() {
for(int i = bins.length-1; i >= 0; i--) {
if(bins[i] != null)
return bins[i];
}
return null;
}
/**
* Retrieve the bins from the bin tree.
* @return list of bins.
*/
public GATKBin[] getBins() {
return bins;
}
/**
* Gets the number of bins in a given bin list.
* @return Number of bins in the list.
*/
public int size() {
return bins.length;
}
/**
* Checks overlap between this bin tree and other bin trees.
* @param position the position over which to detect overlap.
* @return True if the segment overlaps. False otherwise.
*/
public boolean overlaps(final GenomeLoc position) {
for(GATKBin gatkBin: bins) {
if(gatkBin == null)
continue;
Bin bin = new Bin(gatkBin.getReferenceSequence(),gatkBin.getBinNumber());
// Overlap occurs when the position is not disjoint with the bin boundaries.
if(!(position.getStop() < index.getFirstLocusInBin(bin) || position.getStart() > index.getLastLocusInBin(bin)))
return true;
}
return false;
}
}
/**
* Iterate through all bin trees in sequence, from those covering base 1 to those covering MAX_BINS.
*/
class BinTreeIterator implements Iterator<BinTree> {
/**
* The index over which to iterate.
*/
private final GATKBAMIndex index;
/**
* Iterators over each individual level.
*/
private final PeekableIterator<GATKBin>[] levelIterators;
/**
* The next bin tree to be returned.
*/
private BinTree nextBinTree;
/**
* Each iteration through the bin tree has a corresponding lowest level. Make sure
* every lowest-level bin is covered, whether that bin is present or not.
*/
private int currentBinInLowestLevel;
public BinTreeIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
this.index = index;
BAMIndexBinIterator binIterator = new BAMIndexBinIterator(index,indexFile,referenceSequence);
levelIterators = new PeekableIterator[GATKBAMIndex.getNumIndexLevels()];
for(int level = 0; level < GATKBAMIndex.getNumIndexLevels(); level++)
levelIterators[level] = new PeekableIterator<GATKBin>(binIterator.getIteratorOverLevel(level));
// Set the current bin to one less that the first bin in the sequence. advance() will push it
// ahead to the first bin in the lowest level.
currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1) - 1;
advance();
}
public boolean hasNext() {
return nextBinTree != null;
}
/**
* Return the next BinTree in the level.
* @return Next BinTree in sequence.
*/
public BinTree next() {
if(!hasNext())
throw new NoSuchElementException("BinTreeIterator is out of elements");
BinTree currentBinTree = nextBinTree;
advance();
return currentBinTree;
}
/**
* Bring the bin tree ahead to the next overlapping structure.
*/
private void advance() {
final int lowestLevel = GATKBAMIndex.getNumIndexLevels()-1;
final int firstBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(lowestLevel);
final int binsInLowestLevel = index.getLevelSize(lowestLevel);
currentBinInLowestLevel++;
GATKBin[] bins = new GATKBin[GATKBAMIndex.getNumIndexLevels()];
nextBinTree = null;
while(nextBinTree == null) {
for(int level = lowestLevel; level >= 0; level--) {
if(!levelIterators[level].hasNext())
continue;
final int firstBinInThisLevel = GATKBAMIndex.getFirstBinInLevel(level);
final int binsInThisLevel = index.getLevelSize(level);
final int currentBinInThisLevel = ((currentBinInLowestLevel-firstBinInLowestLevel)*binsInThisLevel/binsInLowestLevel) + firstBinInThisLevel;
while(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() < currentBinInThisLevel)
levelIterators[level].next();
if(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() == currentBinInThisLevel)
bins[level] = levelIterators[level].peek();
}
for(int level = 0; level <= lowestLevel; level++) {
if(bins[level] != null) {
nextBinTree = new BinTree(index,bins);
break;
}
}
}
}
/**
* Remove unsupported.
*/
public void remove() {
throw new UnsupportedOperationException("Cannot remove elements from a BinTreeIterator");
}
}

View File

@ -61,6 +61,10 @@ class FilePointer {
this.isRegionUnmapped = false;
}
public FilePointer(final String referenceSequence) {
this(referenceSequence,null);
}
public void addLocation(GenomeLoc location) {
locations.add(location);
}

View File

@ -103,6 +103,14 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
}
}
/**
* Gets the file backing this index.
* @return The index file.
*/
public File getIndexFile() {
return mFile;
}
/**
* Get the number of levels employed by this index.
* @return Number of levels in this index.
@ -127,7 +135,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
*/
public int getLevelSize(final int levelNumber) {
if(levelNumber == getNumIndexLevels()-1)
return MAX_BINS-LEVEL_STARTS[levelNumber];
return MAX_BINS-LEVEL_STARTS[levelNumber]-1;
else
return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber];
}
@ -272,7 +280,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
for(GATKBin bin: bins) {
for(GATKChunk chunk: bin.getGATKChunkList())
for(GATKChunk chunk: bin.getChunkList())
chunkList.add(chunk.clone());
}
@ -281,7 +289,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
}
chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos));
return new GATKBAMFileSpan(chunkList);
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
}
/**
@ -322,13 +330,13 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
for(GATKBin coveringBin: binTree) {
for(GATKChunk chunk: coveringBin.getGATKChunkList())
for(GATKChunk chunk: coveringBin.getChunkList())
chunkList.add(chunk.clone());
}
final int start = getFirstLocusInBin(bin);
chunkList = optimizeChunkList(chunkList,indexQuery.getLinearIndex().getMinimumOffset(start));
return new GATKBAMFileSpan(chunkList);
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
}
public GATKBAMFileSpan getContentsOfBin(final Bin bin) {
@ -344,7 +352,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber());
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getGATKChunkList()) : null;
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null;
}
/**
@ -420,8 +428,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
skipBytes(16 * nChunks);
}
GATKBin bin = new GATKBin(referenceSequence, indexBin);
bin.setGATKChunkList(chunks);
bin.setLastChunk(lastChunk);
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
bins[indexBin] = bin;
}
@ -560,7 +567,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
}
}
private void skipToSequence(final int sequenceIndex) {
protected void skipToSequence(final int sequenceIndex) {
for (int i = 0; i < sequenceIndex; i++) {
// System.out.println("# Sequence TID: " + i);
final int nBins = readInteger();
@ -581,20 +588,23 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
mFileBuffer.get(bytes);
}
private int readInteger() {
protected int readInteger() {
return mFileBuffer.getInt();
}
private long readLong() {
protected long readLong() {
return mFileBuffer.getLong();
}
private void skipBytes(final int count) {
protected void skipBytes(final int count) {
mFileBuffer.position(mFileBuffer.position() + count);
}
private void seek(final int position) {
protected void seek(final int position) {
mFileBuffer.position(position);
}
protected long position() {
return mFileBuffer.position();
}
}

View File

@ -158,8 +158,8 @@ public class IntervalSharder {
if(!binIterator.hasNext())
break;
int locationStart = (int)location.getStart();
final int locationStop = (int)location.getStop();
int locationStart = location.getStart();
final int locationStop = location.getStop();
// Advance to first bin.
while(binIterator.peek().stop < locationStart)

View File

@ -83,7 +83,10 @@ public class LocusShardStrategy implements ShardStrategy {
else
intervals = locations;
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
if(SAMDataSource.TRY_LOW_MEMORY_SHARDING)
this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
else
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
}
else {
final int maxShardSize = 100000;

View File

@ -0,0 +1,193 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKBin;
import net.sf.samtools.GATKChunk;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once.
*/
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
private static Logger logger = Logger.getLogger(IntervalSharder.class);
private final SAMDataSource dataSource;
private final GenomeLocSortedSet loci;
private final PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus;
private FilePointer nextFilePointer = null;
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
this.dataSource = dataSource;
this.loci = loci;
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
if(locusIterator.hasNext())
currentLocus = locusIterator.next();
advance();
}
public boolean hasNext() {
return nextFilePointer != null;
}
public FilePointer next() {
if(!hasNext())
throw new NoSuchElementException("No next element available in interval sharder");
FilePointer currentFilePointer = nextFilePointer;
advance();
return currentFilePointer;
}
public void remove() {
throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder");
}
private void advance() {
if(loci.isEmpty())
return;
nextFilePointer = null;
while(nextFilePointer == null && currentLocus != null) {
nextFilePointer = new FilePointer(currentLocus.getContig());
int coveredRegionStart = 1;
int coveredRegionStop = Integer.MAX_VALUE;
GenomeLoc coveredRegion = null;
for(SAMReaderID reader: dataSource.getReaderIDs()) {
GATKBAMIndex index = (GATKBAMIndex)dataSource.getIndex(reader);
BinTree binTree = getNextOverlappingBinTree((GATKBAMIndex)dataSource.getIndex(reader),currentLocus);
if(binTree != null) {
coveredRegionStart = Math.max(coveredRegionStart,index.getFirstLocusInBin(binTree.getLowestLevelBin().toBin()));
coveredRegionStop = Math.min(coveredRegionStop,index.getLastLocusInBin(binTree.getLowestLevelBin().toBin()));
coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop);
GATKBAMFileSpan fileSpan = generateFileSpan(index,binTree,currentLocus);
nextFilePointer.addFileSpans(reader,fileSpan);
}
}
// Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list.
GenomeLoc initialLocation = currentLocus.intersect(coveredRegion);
nextFilePointer.addLocation(initialLocation);
// See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval.
if(!nextFilePointer.locations.isEmpty()) {
while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) {
currentLocus = locusIterator.next();
nextFilePointer.addLocation(locusIterator.next());
}
// Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus,
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
if(coveredRegionStop < currentLocus.getStop())
currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop());
else if(locusIterator.hasNext())
currentLocus = locusIterator.next();
else
currentLocus = null;
}
}
}
/**
* The last reference sequence processed by this iterator.
*/
private int lastReferenceSequenceLoaded = -1;
/**
* The stateful iterator used to progress through the genoem.
*/
private PeekableIterator<BinTree> binTreeIterator = null;
/**
* Get the next overlapping tree of bins associated with the given BAM file.
* @param index BAM index representation.
* @param locus Locus for which to grab the bin tree, if available.
* @return The BinTree overlapping the given locus.
*/
private BinTree getNextOverlappingBinTree(final GATKBAMIndex index, final GenomeLoc locus) {
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(locus.getContigIndex() != lastReferenceSequenceLoaded) {
if(binTreeIterator != null)
binTreeIterator.close();
lastReferenceSequenceLoaded = locus.getContigIndex();
binTreeIterator = new PeekableIterator<BinTree>(new BinTreeIterator(index,index.getIndexFile(),locus.getContigIndex()));
}
if(!binTreeIterator.hasNext())
return null;
BinTree binTree = binTreeIterator.peek();
while(index.getLastLocusInBin(binTree.getLowestLevelBin().toBin()) < locus.getStart()) {
binTreeIterator.next(); // Before the point of interest. Consume this one.
binTree = binTreeIterator.peek();
}
if(binTree.overlaps(locus)) {
binTreeIterator.next();
return binTree;
}
return null;
}
/**
* Converts a bin list to a file span, trimmed based on the linear index and with overlapping regions removed.
* @param index BAM index.
* @param binTree Tree of data found to overlap the region. binTree.overlaps(initialRegion) must return true.
* @param initialRegion The region to employ when trimming the linear index.
* @return File span mapping to given region.
*/
private GATKBAMFileSpan generateFileSpan(final GATKBAMIndex index, final BinTree binTree, final GenomeLoc initialRegion) {
List<GATKChunk> chunks = new ArrayList<GATKChunk>(binTree.size());
for(GATKBin bin: binTree.getBins()) {
if(bin == null)
continue;
chunks.addAll(Arrays.asList(bin.getChunkList()));
}
// Optimize the chunk list with a linear index optimization
chunks = index.optimizeChunkList(chunks,index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart()));
return new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()]));
}
}

View File

@ -46,6 +46,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
/**
@ -79,6 +81,11 @@ public class SAMDataSource {
*/
private final SAMFileReader.ValidationStringency validationStringency;
/**
* Store BAM indices for each reader present.
*/
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
/**
* How far along is each reader?
*/
@ -121,6 +128,8 @@ public class SAMDataSource {
*/
private final SAMResourcePool resourcePool;
static final boolean TRY_LOW_MEMORY_SHARDING = false;
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
@ -272,6 +281,17 @@ public class SAMDataSource {
originalToMergedReadGroupMappings.put(id,mappingToMerged);
}
if(TRY_LOW_MEMORY_SHARDING) {
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.samFile);
if(indexFile != null) {
SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary();
GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary);
bamIndices.put(id,index);
}
}
}
resourcePool.releaseReaders(readers);
}
@ -366,14 +386,18 @@ public class SAMDataSource {
/**
* True if all readers have an index.
* @return
* @return True if all readers have an index.
*/
public boolean hasIndex() {
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
if(!reader.hasIndex())
return false;
if(TRY_LOW_MEMORY_SHARDING)
return readerIDs.size() == bamIndices.size();
else {
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
if(!reader.hasIndex())
return false;
}
return true;
}
return true;
}
/**
@ -382,8 +406,12 @@ public class SAMDataSource {
* @return The index. Will preload the index if necessary.
*/
public BrowseableBAMIndex getIndex(final SAMReaderID id) {
SAMReaders readers = resourcePool.getReadersWithoutLocking();
return readers.getReader(id).getBrowseableIndex();
if(TRY_LOW_MEMORY_SHARDING)
return bamIndices.get(id);
else {
SAMReaders readers = resourcePool.getReadersWithoutLocking();
return readers.getReader(id).getBrowseableIndex();
}
}
/**
@ -701,11 +729,10 @@ public class SAMDataSource {
for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = new SAMFileReader(readerID.samFile);
reader.enableFileSource(true);
reader.enableIndexCaching(true);
if(!TRY_LOW_MEMORY_SHARDING)
reader.enableIndexCaching(true);
reader.setValidationStringency(validationStringency);
// If no read group is present, hallucinate one.
// TODO: Straw poll to see whether this is really required.
final SAMFileHeader header = reader.getFileHeader();
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
@ -944,6 +971,37 @@ public class SAMDataSource {
}
}
}
/**
* Locates the index file alongside the given BAM, if present.
* TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
* @param bamFile The data file to use.
* @return A File object if the index file is present; null otherwise.
*/
private File findIndexFile(File bamFile) {
File indexFile;
try {
Class bamFileReaderClass = Class.forName("net.sf.samtools.BAMFileReader");
Method indexFileLocator = bamFileReaderClass.getDeclaredMethod("findIndexFile",File.class);
indexFileLocator.setAccessible(true);
indexFile = (File)indexFileLocator.invoke(null,bamFile);
}
catch(ClassNotFoundException ex) {
throw new ReviewedStingException("Unable to locate BAMFileReader class, used to check for index files");
}
catch(NoSuchMethodException ex) {
throw new ReviewedStingException("Unable to locate Picard index file locator.");
}
catch(IllegalAccessException ex) {
throw new ReviewedStingException("Unable to access Picard index file locator.");
}
catch(InvocationTargetException ex) {
throw new ReviewedStingException("Unable to invoke Picard index file locator.");
}
return indexFile;
}
}

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.utils.SimpleTimer;
import java.io.File;
/**
* Basic suite for testing idealized and actual performance of read processing.
*/
public class BAMProcessingPerformanceMeter extends CommandLineProgram {
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true)
File samFile;
@Input(fullName = "reference_file", shortName="R", doc = "Associated FASTA sequence", required = true)
File referenceFile;
@Argument(fullName="test_repetitions", shortName = "test_reps", doc="Number of times to repeat each test", required = false)
int testRepetitions = 5;
@Argument(fullName="print_frequency", shortName = "pf", doc="Print cumulative time after x # reads", required = false)
int printFrequency = 100000;
private void testBAMFileProcessingThroughput(ReadProcessor readProcessor) {
readProcessor.execute(samFile,referenceFile);
}
public int execute() {
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new NoAdditionalProcessing(this));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverEachBase(this));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverCigarString(this));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new ExtractTag(this,"OQ"));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeSamLocusIterator(this));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, GATKArgumentCollection.getDefaultDownsamplingMethod()));
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, DownsamplingMethod.NONE));
GATKWalkerInvoker countReadsInvoker = new GATKWalkerInvoker(this);
CountReadsPerformanceWalker countReadsWalker = new CountReadsPerformanceWalker(countReadsInvoker);
countReadsInvoker.setWalker(countReadsWalker);
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countReadsInvoker);
GATKWalkerInvoker countBasesInReadInvoker = new GATKWalkerInvoker(this);
CountBasesInReadPerformanceWalker countBasesInReadWalker = new CountBasesInReadPerformanceWalker(countBasesInReadInvoker);
countBasesInReadInvoker.setWalker(countBasesInReadWalker);
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countBasesInReadInvoker);
return 0;
}
/**
* Required main method implementation.
* @param argv Command-line argument text.
* @throws Exception on error.
*/
public static void main(String[] argv) throws Exception {
int returnCode = 0;
try {
BAMProcessingPerformanceMeter instance = new BAMProcessingPerformanceMeter();
start(instance, argv);
returnCode = 0;
}
catch(Exception ex) {
returnCode = 1;
ex.printStackTrace();
throw ex;
}
finally {
System.exit(returnCode);
}
}
}
abstract class ReadProcessor {
private final SimpleTimer timer;
private final int printFrequency;
protected int iterations = 0;
public ReadProcessor(BAMProcessingPerformanceMeter performanceMeter) {
timer = new SimpleTimer("timer");
this.printFrequency = performanceMeter.printFrequency;
}
public abstract String getTestName();
public String getIterationType() { return "loci"; }
public void processRead(final SAMRecord read) { }
public void execute(File bamFile,File fastaFile) {
SAMFileReader reader = new SAMFileReader(bamFile);
startTest();
for(SAMRecord read: reader) {
processRead(read);
updateIterationCount();
}
stopTest();
reader.close();
}
protected void startTest() {
timer.start();
}
protected void stopTest() {
timer.stop();
printStatus("TEST COMPLETE");
}
protected void updateIterationCount() {
if(++iterations % printFrequency == 0) printStatus("ONGOING");
}
private void printStatus(String prefix) {
System.out.printf("%s: %s printed %d %s in %f seconds.%n",prefix,getTestName(),iterations,getIterationType(),timer.getElapsedTime());
}
}

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:55 AM
* To change this template use File | Settings | File Templates.
*/
class CountBasesInReadPerformanceWalker extends ReadWalker<Integer,Long> {
private long As;
private long Cs;
private long Gs;
private long Ts;
private final GATKWalkerInvoker invoker;
public CountBasesInReadPerformanceWalker(GATKWalkerInvoker walkerInvoker) {
this.invoker = walkerInvoker;
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
for(byte base: read.getReadBases()) {
switch(base) {
case 'A': As++; break;
case 'C': Cs++; break;
case 'G': Gs++; break;
case 'T': Ts++; break;
}
}
invoker.updateIterationCount();
return 1;
}
public Long reduceInit() { return 0L; }
public Long reduce(Integer value, Long accum) { return value + accum; }
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:55 AM
* To change this template use File | Settings | File Templates.
*/
class CountReadsPerformanceWalker extends ReadWalker<Integer,Long> {
private final GATKWalkerInvoker invoker;
public CountReadsPerformanceWalker(GATKWalkerInvoker walkerInvoker) {
this.invoker = walkerInvoker;
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
invoker.updateIterationCount();
return 1;
}
public Long reduceInit() { return 0L; }
public Long reduce(Integer value, Long accum) { return value + accum; }
}

View File

@ -0,0 +1,49 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:53 AM
* To change this template use File | Settings | File Templates.
*/
class ExtractTag extends ReadProcessor {
private final String tag;
public ExtractTag(final BAMProcessingPerformanceMeter performanceMeter, final String tag) {
super(performanceMeter);
this.tag = tag;
}
@Override
public String getTestName() { return "extract tag"; }
public void processRead(final SAMRecord read) {
read.getAttribute(tag);
}
}

View File

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.picard.filter.SamRecordFilter;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.walkers.Walker;
import java.io.File;
import java.util.Collections;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:54 AM
* To change this template use File | Settings | File Templates.
*/
class GATKWalkerInvoker extends ReadProcessor {
/**
* Walker to run over the existing dataset.
*/
private Walker<?,?> walker;
public GATKWalkerInvoker(BAMProcessingPerformanceMeter performanceMeter) {
super(performanceMeter);
}
@Override
public String getTestName() { return "GATK-CountReads"; }
public void setWalker(Walker<?,?> walker) {
this.walker = walker;
}
@Override
public void execute(File samFile, File fastaFile) {
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
// Establish the argument collection
GATKArgumentCollection argCollection = new GATKArgumentCollection();
argCollection.referenceFile = fastaFile;
argCollection.samFiles = Collections.singletonList(samFile.getAbsolutePath());
engine.setArguments(argCollection);
// Bugs in the engine mean that this has to be set twice.
engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(samFile,new Tags())));
engine.setFilters(Collections.<SamRecordFilter>emptyList());
engine.setReferenceMetaDataFiles(Collections.<RMDTriplet>emptyList());
// Create the walker
engine.setWalker(walker);
startTest();
engine.execute();
stopTest();
}
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.picard.filter.FilteringIterator;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.baq.BAQ;
import java.io.File;
import java.util.Collections;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:54 AM
* To change this template use File | Settings | File Templates.
*/
class InvokeLocusIteratorByState extends ReadProcessor {
private final DownsamplingMethod downsamplingMethod;
public InvokeLocusIteratorByState(final BAMProcessingPerformanceMeter performanceMeter,DownsamplingMethod downsamplingMethod) {
super(performanceMeter);
this.downsamplingMethod = downsamplingMethod;
}
@Override
public String getTestName() {
if(downsamplingMethod != DownsamplingMethod.NONE)
return String.format("invoke locus iterator by state; downsampling by sample to coverage = %d; ",downsamplingMethod.toCoverage);
else
return String.format("invoke locus iterator by state; no downsampling; ");
}
@Override
public String getIterationType() { return "loci"; }
@Override
public void execute(File samFile, File fastaFile) {
SAMFileReader reader = new SAMFileReader(samFile);
ReadProperties readProperties = new ReadProperties(Collections.<SAMReaderID>singletonList(new SAMReaderID(samFile,new Tags())),
reader.getFileHeader(),
false,
SAMFileReader.ValidationStringency.SILENT,
0,
downsamplingMethod,
new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)),
Collections.<SamRecordFilter>emptyList(),
false,
false,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null,
(byte)0);
GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary());
SampleDataSource sampleDataSource = new SampleDataSource();
sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader());
// Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out?
Iterator<SAMRecord> readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter());
LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource);
startTest();
while(locusIteratorByState.hasNext()) {
locusIteratorByState.next();
updateIterationCount();
}
stopTest();
reader.close();
}
}

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.picard.util.SamLocusIterator;
import net.sf.samtools.SAMFileReader;
import java.io.File;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:54 AM
* To change this template use File | Settings | File Templates.
*/
class InvokeSamLocusIterator extends ReadProcessor {
public InvokeSamLocusIterator(final BAMProcessingPerformanceMeter performanceMeter) {
super(performanceMeter);
}
@Override
public String getTestName() {
return String.format("invoke sam locus iterator");
}
@Override
public String getIterationType() { return "loci"; }
@Override
public void execute(File samFile, File fastaFile) {
SAMFileReader reader = new SAMFileReader(samFile);
SamLocusIterator samLocusIterator = new SamLocusIterator(reader);
samLocusIterator.setEmitUncoveredLoci(false);
Iterator<SamLocusIterator.LocusInfo> workhorseIterator = samLocusIterator.iterator();
startTest();
while(workhorseIterator.hasNext()) {
SamLocusIterator.LocusInfo locusInfo = workhorseIterator.next();
updateIterationCount();
}
stopTest();
reader.close();
}
}

View File

@ -0,0 +1,65 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:53 AM
* To change this template use File | Settings | File Templates.
*/
class IterateOverCigarString extends ReadProcessor {
private long matchMismatches;
private long insertions;
private long deletions;
private long others;
public IterateOverCigarString(final BAMProcessingPerformanceMeter performanceMeter) {
super(performanceMeter);
}
@Override
public String getTestName() { return "iterator over cigar string"; }
public void processRead(final SAMRecord read) {
Cigar cigar = read.getCigar();
for(CigarElement cigarElement: cigar.getCigarElements()) {
int elementSize = cigarElement.getLength();
while(elementSize > 0) {
switch(cigarElement.getOperator()) {
case M: matchMismatches++; break;
case I: insertions++; break;
case D: deletions++; break;
default: others++; break;
}
elementSize--;
}
}
}
}

View File

@ -0,0 +1,58 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:53 AM
* To change this template use File | Settings | File Templates.
*/
class IterateOverEachBase extends ReadProcessor {
private long As;
private long Cs;
private long Gs;
private long Ts;
public IterateOverEachBase(final BAMProcessingPerformanceMeter performanceMeter) {
super(performanceMeter);
}
@Override
public String getTestName() { return "iterate over each base"; }
public void processRead(final SAMRecord read) {
for(byte base: read.getReadBases()) {
switch(base) {
case 'A': As++; break;
case 'C': Cs++; break;
case 'G': Gs++; break;
case 'T': Ts++; break;
}
}
}
}

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.performance;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Feb 25, 2011
* Time: 10:16:53 AM
* To change this template use File | Settings | File Templates.
*/
class NoAdditionalProcessing extends ReadProcessor {
public NoAdditionalProcessing(final BAMProcessingPerformanceMeter performanceMeter) {
super(performanceMeter);
}
@Override
public String getTestName() { return "no additional processing"; }
public void processRead(final SAMRecord read) {}
}

View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads.utilities;
import net.sf.samtools.util.BlockGunzipper;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
/**
* Test decompression of a single BGZF block.
*/
public class UnzipSingleBlock extends CommandLineProgram {
@Input(fullName = "block_file", shortName = "b", doc = "block file over which to test unzipping", required = true)
private File blockFile;
@Input(fullName = "compressed_block_size", shortName = "cbs", doc = "size of compressed block", required = true)
private int compressedBufferSize;
public int execute() throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException {
byte[] compressedBuffer = new byte[(int)blockFile.length()];
byte[] uncompressedBuffer = new byte[65536];
FileInputStream fis = new FileInputStream(blockFile);
fis.read(compressedBuffer);
fis.close();
BlockGunzipper gunzipper = new BlockGunzipper();
gunzipper.setCheckCrcs(true);
Method unzipBlock = BlockGunzipper.class.getDeclaredMethod("unzipBlock",byte[].class,byte[].class,Integer.TYPE);
unzipBlock.setAccessible(true);
unzipBlock.invoke(gunzipper,uncompressedBuffer,compressedBuffer,compressedBufferSize);
System.out.printf("SUCCESS!%n");
return 0;
}
/**
* Required main method implementation.
* @param argv Command-line argument text.
* @throws Exception on error.
*/
public static void main(String[] argv) throws Exception {
int returnCode = 0;
try {
UnzipSingleBlock instance = new UnzipSingleBlock();
start(instance, argv);
returnCode = 0;
}
catch(Exception ex) {
returnCode = 1;
ex.printStackTrace();
throw ex;
}
finally {
System.exit(returnCode);
}
}
}

View File

@ -88,7 +88,7 @@ public class GATKBAMIndexUnitTest extends BaseTest {
// Level 5
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681);
Assert.assertEquals(bamIndex.getLevelSize(5),37449-4681+1);
Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1);
}
}