A checkpoint commit of two BAM reading projects going on simultaneously. These two projects
are works in progress, and this checkin will provide a baseline against which to gauge improvements to both projects. Low-memory BAM protoshards (disabled by default): - Currently passing ValidatingPileupIntegrationTest. - Gets progressively slower throughout the traversal, but should run at least as fast as original implementation. - Uses 10+ file handles per BAM, but should use 3. BAM performance microbenchmark test system: - Currently tests performance of BAM reading using SAM-JDK vs. GATK - Tests do not hit all GATK performance hotspots. - New tests that require input data in a slightly different form are hard to implement. - Output of test results is not easily parseable (investigating Google Caliper for possible improvements). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5317 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ad1e4f47b1
commit
600f73cbd6
|
|
@ -25,6 +25,7 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -53,8 +54,8 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
* Create a new chunk list from the given list of chunks.
|
||||
* @param chunks Constituent chunks.
|
||||
*/
|
||||
public GATKBAMFileSpan(final List<GATKChunk> chunks) {
|
||||
super(new ArrayList<Chunk>(chunks));
|
||||
public GATKBAMFileSpan(final GATKChunk[] chunks) {
|
||||
super(Arrays.<Chunk>asList(chunks));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -33,38 +33,102 @@ import java.util.List;
|
|||
* override GATKBin and make it public.
|
||||
* TODO: Eliminate once we determine the final fate of the BAM index reading code.
|
||||
*/
|
||||
public class GATKBin extends Bin {
|
||||
public class GATKBin implements Comparable<GATKBin> {
|
||||
/**
|
||||
* The reference sequence associated with this bin.
|
||||
*/
|
||||
private final int referenceSequence;
|
||||
|
||||
/**
|
||||
* The number of this bin within the BAM file.
|
||||
*/
|
||||
private final int binNumber;
|
||||
|
||||
/**
|
||||
* The chunks associated with this bin.
|
||||
*/
|
||||
private GATKChunk[] chunkList;
|
||||
|
||||
public GATKBin(Bin bin) {
|
||||
this(bin.getReferenceSequence(),bin.getBinNumber());
|
||||
}
|
||||
|
||||
public GATKBin(final int referenceSequence, final int binNumber) {
|
||||
super(referenceSequence,binNumber);
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.binNumber = binNumber;
|
||||
}
|
||||
|
||||
public GATKBin(final Bin bin) {
|
||||
super(bin.getReferenceSequence(),bin.getBinNumber());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getReferenceSequence() {
|
||||
return super.getReferenceSequence();
|
||||
return referenceSequence;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getBinNumber() {
|
||||
return super.getBinNumber();
|
||||
return binNumber;
|
||||
}
|
||||
|
||||
public List<GATKChunk> getGATKChunkList() {
|
||||
List<GATKChunk> gatkChunks = new ArrayList<GATKChunk>();
|
||||
for(Chunk chunk: getChunkList())
|
||||
gatkChunks.add(new GATKChunk(chunk));
|
||||
return gatkChunks;
|
||||
}
|
||||
|
||||
public void setGATKChunkList(List<GATKChunk> chunks) {
|
||||
super.setChunkList(new ArrayList<Chunk>(chunks));
|
||||
/**
|
||||
* Convert this GATKBin to a normal bin, for processing with the standard BAM query interface.
|
||||
* @return
|
||||
*/
|
||||
public Bin toBin() {
|
||||
return new Bin(referenceSequence,binNumber);
|
||||
}
|
||||
|
||||
/**
|
||||
* See whether two bins are equal. If the ref seq and the bin number
|
||||
* are equal, assume equality of the chunk list.
|
||||
* @param other The other Bin to which to compare this.
|
||||
* @return True if the two bins are equal. False otherwise.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("Bin %d in contig %d",getBinNumber(),getReferenceSequence());
|
||||
public boolean equals(Object other) {
|
||||
if(other == null) return false;
|
||||
if(!(other instanceof GATKBin)) return false;
|
||||
|
||||
GATKBin otherBin = (GATKBin)other;
|
||||
return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a unique hash code for the given reference sequence and bin number.
|
||||
* @return A unique hash code.
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return ((Integer)referenceSequence).hashCode() ^ ((Integer)binNumber).hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two bins to see what ordering they should appear in.
|
||||
* @param other Other bin to which this bin should be compared.
|
||||
* @return -1 if this < other, 0 if this == other, 1 if this > other.
|
||||
*/
|
||||
public int compareTo(GATKBin other) {
|
||||
if(other == null)
|
||||
throw new ClassCastException("Cannot compare to a null object");
|
||||
|
||||
// Check the reference sequences first.
|
||||
if(this.referenceSequence != other.referenceSequence)
|
||||
return referenceSequence - other.referenceSequence;
|
||||
|
||||
// Then check the bin ordering.
|
||||
return binNumber - other.binNumber;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the chunks associated with this bin
|
||||
*/
|
||||
public void setChunkList(GATKChunk[] list){
|
||||
chunkList = list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the list of chunks associated with this bin.
|
||||
* @return the chunks in this bin. If no chunks are associated, an empty list will be returned.
|
||||
*/
|
||||
public GATKChunk[] getChunkList(){
|
||||
if(chunkList == null)
|
||||
return new GATKChunk[0];
|
||||
return chunkList;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ public class GATKArgumentCollection {
|
|||
* method.
|
||||
* @return The default downsampling mechanism, or null if none exists.
|
||||
*/
|
||||
public DownsamplingMethod getDefaultDownsamplingMethod() {
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod() {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,266 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBin;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Creates an 'index of the index' for a particular reference sequence
|
||||
* within the BAM file, for easier whole-BAM-file traversal.
|
||||
* file.
|
||||
*/
|
||||
public class BAMIndexBinIterator {
|
||||
/**
|
||||
* The source of index data.
|
||||
*/
|
||||
private final GATKBAMIndex index;
|
||||
|
||||
/**
|
||||
* The file storing the index data.
|
||||
*/
|
||||
private final File indexFile;
|
||||
|
||||
/**
|
||||
* File storing index metadata.
|
||||
*/
|
||||
private final File metaIndexFile;
|
||||
|
||||
/**
|
||||
* Reference sequence that temporary file is based on.
|
||||
*/
|
||||
private final int referenceSequence;
|
||||
|
||||
/**
|
||||
* Size of a long in bytes.
|
||||
*/
|
||||
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
|
||||
|
||||
public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
|
||||
this.index = index;
|
||||
this.indexFile = indexFile;
|
||||
this.referenceSequence = referenceSequence;
|
||||
|
||||
index.seek(4);
|
||||
|
||||
final int sequenceCount = index.readInteger();
|
||||
|
||||
if (referenceSequence >= sequenceCount)
|
||||
throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount));
|
||||
|
||||
index.skipToSequence(referenceSequence);
|
||||
|
||||
int binCount = index.readInteger();
|
||||
|
||||
try {
|
||||
metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null);
|
||||
metaIndexFile.deleteOnExit();
|
||||
|
||||
FileOutputStream metaIndex = new FileOutputStream(metaIndexFile);
|
||||
FileChannel metaIndexChannel = metaIndex.getChannel();
|
||||
|
||||
// zero out the contents of the file. Arrays of primitives in java are always zeroed out by default.
|
||||
byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*(Long.SIZE/8)]; // byte array is zeroed out by default.
|
||||
metaIndexChannel.write(ByteBuffer.wrap(emptyContents));
|
||||
|
||||
ByteBuffer binPositionBuffer = ByteBuffer.allocate(emptyContents.length);
|
||||
binPositionBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
for (int binNumber = 0; binNumber < binCount; binNumber++) {
|
||||
long position = index.position();
|
||||
|
||||
final int indexBin = index.readInteger();
|
||||
|
||||
metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES);
|
||||
binPositionBuffer.putLong(position);
|
||||
binPositionBuffer.flip();
|
||||
|
||||
System.out.printf("Writing bin number %d to position %d: coordinate = %d%n",indexBin,indexBin*Long.SIZE*8,position);
|
||||
|
||||
metaIndexChannel.write(binPositionBuffer);
|
||||
binPositionBuffer.flip();
|
||||
|
||||
final int nChunks = index.readInteger();
|
||||
index.skipBytes(16 * nChunks);
|
||||
}
|
||||
|
||||
metaIndexChannel.close();
|
||||
metaIndex.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to write BAM metaindex",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
metaIndexFile.delete();
|
||||
}
|
||||
|
||||
public CloseableIterator<GATKBin> getIteratorOverLevel(int level) {
|
||||
return new LevelIterator(level);
|
||||
}
|
||||
|
||||
private class LevelIterator implements CloseableIterator<GATKBin> {
|
||||
/**
|
||||
* The raw BAM index file with unordered bins.
|
||||
*/
|
||||
private final FileInputStream indexInputStream;
|
||||
|
||||
/**
|
||||
* The index metafile, with pointers to ordered bins.
|
||||
*/
|
||||
private final FileInputStream metaIndexInputStream;
|
||||
|
||||
/**
|
||||
* The first and last bins in the level.
|
||||
*/
|
||||
private final int firstBinNumber, lastBinNumber;
|
||||
|
||||
/**
|
||||
* The current bin in the index.
|
||||
*/
|
||||
private int currentBinNumber;
|
||||
|
||||
/**
|
||||
* Position of the most recent chunk data.
|
||||
*/
|
||||
private GATKBin nextBin = null;
|
||||
|
||||
public LevelIterator(final int level) {
|
||||
try {
|
||||
indexInputStream = new FileInputStream(indexFile);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to open index file for reading");
|
||||
}
|
||||
try {
|
||||
metaIndexInputStream = new FileInputStream(metaIndexFile);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to open index metafile for reading");
|
||||
}
|
||||
|
||||
firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level);
|
||||
lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1;
|
||||
|
||||
currentBinNumber = firstBinNumber - 1;
|
||||
advance();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
indexInputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to close index file.");
|
||||
}
|
||||
try {
|
||||
metaIndexInputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to close index metafile");
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextBin != null;
|
||||
}
|
||||
|
||||
public GATKBin next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("Out of elements in BAMIndexBinIterator");
|
||||
GATKBin currentPosition = nextBin;
|
||||
advance();
|
||||
return currentPosition;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); }
|
||||
|
||||
private void advance() {
|
||||
ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2);
|
||||
indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
nextBin = null;
|
||||
try {
|
||||
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES);
|
||||
while(nextBin == null && currentBinNumber < lastBinNumber) {
|
||||
currentBinNumber++;
|
||||
metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES);
|
||||
metaIndexInputStream.getChannel().read(indexFilePositionDecoder);
|
||||
indexFilePositionDecoder.flip();
|
||||
long currentPosition = indexFilePositionDecoder.getLong();
|
||||
indexFilePositionDecoder.flip();
|
||||
|
||||
if(currentPosition != 0) {
|
||||
indexInputStream.getChannel().position(currentPosition);
|
||||
indexInputStream.getChannel().read(indexFilePositionDecoder);
|
||||
|
||||
indexFilePositionDecoder.flip();
|
||||
int binNumber = indexFilePositionDecoder.getInt();
|
||||
if(binNumber != currentBinNumber)
|
||||
throw new ReviewedStingException("Index file and metaindex file are out of sync.");
|
||||
|
||||
int nChunks = indexFilePositionDecoder.getInt();
|
||||
GATKChunk[] chunks = new GATKChunk[nChunks];
|
||||
|
||||
indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2);
|
||||
indexFilePositionDecoder.clear();
|
||||
|
||||
for (int ci = 0; ci < nChunks; ci++) {
|
||||
indexInputStream.getChannel().read(indexFilePositionDecoder);
|
||||
|
||||
indexFilePositionDecoder.flip();
|
||||
final long chunkBegin = indexFilePositionDecoder.getLong();
|
||||
final long chunkEnd = indexFilePositionDecoder.getLong();
|
||||
chunks[ci] = new GATKChunk(chunkBegin, chunkEnd);
|
||||
|
||||
indexFilePositionDecoder.flip();
|
||||
}
|
||||
|
||||
nextBin = new GATKBin(referenceSequence,binNumber);
|
||||
nextBin.setChunkList(chunks);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to close index metafile");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -100,7 +100,7 @@ class BAMIndexContent {
|
|||
List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
|
||||
for (GATKBin b : mBinList)
|
||||
if (b.getChunkList() != null) {
|
||||
allChunks.addAll(b.getGATKChunkList());
|
||||
allChunks.addAll(Arrays.asList(b.getChunkList()));
|
||||
}
|
||||
return Collections.unmodifiableList(allChunks);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.GATKBin;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Represents a tree of overlapping bins in a single
|
||||
* BAM index.
|
||||
*/
|
||||
public class BinTree {
|
||||
/**
|
||||
* The BAM index from which this bin data is sourced.
|
||||
*/
|
||||
private final GATKBAMIndex index;
|
||||
|
||||
/**
|
||||
* The bins in this tree, organized by level.
|
||||
*/
|
||||
private final GATKBin[] bins;
|
||||
|
||||
public BinTree(GATKBAMIndex index,final GATKBin[] bins) {
|
||||
this.index = index;
|
||||
this.bins = bins;
|
||||
}
|
||||
|
||||
public GATKBin getLowestLevelBin() {
|
||||
for(int i = bins.length-1; i >= 0; i--) {
|
||||
if(bins[i] != null)
|
||||
return bins[i];
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieve the bins from the bin tree.
|
||||
* @return list of bins.
|
||||
*/
|
||||
public GATKBin[] getBins() {
|
||||
return bins;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the number of bins in a given bin list.
|
||||
* @return Number of bins in the list.
|
||||
*/
|
||||
public int size() {
|
||||
return bins.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks overlap between this bin tree and other bin trees.
|
||||
* @param position the position over which to detect overlap.
|
||||
* @return True if the segment overlaps. False otherwise.
|
||||
*/
|
||||
public boolean overlaps(final GenomeLoc position) {
|
||||
for(GATKBin gatkBin: bins) {
|
||||
if(gatkBin == null)
|
||||
continue;
|
||||
Bin bin = new Bin(gatkBin.getReferenceSequence(),gatkBin.getBinNumber());
|
||||
// Overlap occurs when the position is not disjoint with the bin boundaries.
|
||||
if(!(position.getStop() < index.getFirstLocusInBin(bin) || position.getStart() > index.getLastLocusInBin(bin)))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate through all bin trees in sequence, from those covering base 1 to those covering MAX_BINS.
|
||||
*/
|
||||
class BinTreeIterator implements Iterator<BinTree> {
|
||||
/**
|
||||
* The index over which to iterate.
|
||||
*/
|
||||
private final GATKBAMIndex index;
|
||||
|
||||
/**
|
||||
* Iterators over each individual level.
|
||||
*/
|
||||
private final PeekableIterator<GATKBin>[] levelIterators;
|
||||
|
||||
/**
|
||||
* The next bin tree to be returned.
|
||||
*/
|
||||
private BinTree nextBinTree;
|
||||
|
||||
/**
|
||||
* Each iteration through the bin tree has a corresponding lowest level. Make sure
|
||||
* every lowest-level bin is covered, whether that bin is present or not.
|
||||
*/
|
||||
private int currentBinInLowestLevel;
|
||||
|
||||
public BinTreeIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) {
|
||||
this.index = index;
|
||||
|
||||
BAMIndexBinIterator binIterator = new BAMIndexBinIterator(index,indexFile,referenceSequence);
|
||||
levelIterators = new PeekableIterator[GATKBAMIndex.getNumIndexLevels()];
|
||||
for(int level = 0; level < GATKBAMIndex.getNumIndexLevels(); level++)
|
||||
levelIterators[level] = new PeekableIterator<GATKBin>(binIterator.getIteratorOverLevel(level));
|
||||
|
||||
// Set the current bin to one less that the first bin in the sequence. advance() will push it
|
||||
// ahead to the first bin in the lowest level.
|
||||
currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1) - 1;
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextBinTree != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the next BinTree in the level.
|
||||
* @return Next BinTree in sequence.
|
||||
*/
|
||||
public BinTree next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("BinTreeIterator is out of elements");
|
||||
BinTree currentBinTree = nextBinTree;
|
||||
advance();
|
||||
return currentBinTree;
|
||||
}
|
||||
|
||||
/**
|
||||
* Bring the bin tree ahead to the next overlapping structure.
|
||||
*/
|
||||
private void advance() {
|
||||
final int lowestLevel = GATKBAMIndex.getNumIndexLevels()-1;
|
||||
final int firstBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(lowestLevel);
|
||||
final int binsInLowestLevel = index.getLevelSize(lowestLevel);
|
||||
|
||||
currentBinInLowestLevel++;
|
||||
|
||||
GATKBin[] bins = new GATKBin[GATKBAMIndex.getNumIndexLevels()];
|
||||
nextBinTree = null;
|
||||
while(nextBinTree == null) {
|
||||
for(int level = lowestLevel; level >= 0; level--) {
|
||||
if(!levelIterators[level].hasNext())
|
||||
continue;
|
||||
|
||||
final int firstBinInThisLevel = GATKBAMIndex.getFirstBinInLevel(level);
|
||||
final int binsInThisLevel = index.getLevelSize(level);
|
||||
final int currentBinInThisLevel = ((currentBinInLowestLevel-firstBinInLowestLevel)*binsInThisLevel/binsInLowestLevel) + firstBinInThisLevel;
|
||||
|
||||
while(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() < currentBinInThisLevel)
|
||||
levelIterators[level].next();
|
||||
|
||||
if(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() == currentBinInThisLevel)
|
||||
bins[level] = levelIterators[level].peek();
|
||||
}
|
||||
|
||||
for(int level = 0; level <= lowestLevel; level++) {
|
||||
if(bins[level] != null) {
|
||||
nextBinTree = new BinTree(index,bins);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove unsupported.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove elements from a BinTreeIterator");
|
||||
}
|
||||
}
|
||||
|
|
@ -61,6 +61,10 @@ class FilePointer {
|
|||
this.isRegionUnmapped = false;
|
||||
}
|
||||
|
||||
public FilePointer(final String referenceSequence) {
|
||||
this(referenceSequence,null);
|
||||
}
|
||||
|
||||
public void addLocation(GenomeLoc location) {
|
||||
locations.add(location);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,6 +103,14 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the file backing this index.
|
||||
* @return The index file.
|
||||
*/
|
||||
public File getIndexFile() {
|
||||
return mFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of levels employed by this index.
|
||||
* @return Number of levels in this index.
|
||||
|
|
@ -127,7 +135,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
*/
|
||||
public int getLevelSize(final int levelNumber) {
|
||||
if(levelNumber == getNumIndexLevels()-1)
|
||||
return MAX_BINS-LEVEL_STARTS[levelNumber];
|
||||
return MAX_BINS-LEVEL_STARTS[levelNumber]-1;
|
||||
else
|
||||
return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber];
|
||||
}
|
||||
|
|
@ -272,7 +280,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
|
||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
||||
for(GATKBin bin: bins) {
|
||||
for(GATKChunk chunk: bin.getGATKChunkList())
|
||||
for(GATKChunk chunk: bin.getChunkList())
|
||||
chunkList.add(chunk.clone());
|
||||
}
|
||||
|
||||
|
|
@ -281,7 +289,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
}
|
||||
|
||||
chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos));
|
||||
return new GATKBAMFileSpan(chunkList);
|
||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -322,13 +330,13 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
|
||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
||||
for(GATKBin coveringBin: binTree) {
|
||||
for(GATKChunk chunk: coveringBin.getGATKChunkList())
|
||||
for(GATKChunk chunk: coveringBin.getChunkList())
|
||||
chunkList.add(chunk.clone());
|
||||
}
|
||||
|
||||
final int start = getFirstLocusInBin(bin);
|
||||
chunkList = optimizeChunkList(chunkList,indexQuery.getLinearIndex().getMinimumOffset(start));
|
||||
return new GATKBAMFileSpan(chunkList);
|
||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
||||
}
|
||||
|
||||
public GATKBAMFileSpan getContentsOfBin(final Bin bin) {
|
||||
|
|
@ -344,7 +352,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
|
||||
GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber());
|
||||
|
||||
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getGATKChunkList()) : null;
|
||||
return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -420,8 +428,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
skipBytes(16 * nChunks);
|
||||
}
|
||||
GATKBin bin = new GATKBin(referenceSequence, indexBin);
|
||||
bin.setGATKChunkList(chunks);
|
||||
bin.setLastChunk(lastChunk);
|
||||
bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()]));
|
||||
bins[indexBin] = bin;
|
||||
}
|
||||
|
||||
|
|
@ -560,7 +567,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
}
|
||||
}
|
||||
|
||||
private void skipToSequence(final int sequenceIndex) {
|
||||
protected void skipToSequence(final int sequenceIndex) {
|
||||
for (int i = 0; i < sequenceIndex; i++) {
|
||||
// System.out.println("# Sequence TID: " + i);
|
||||
final int nBins = readInteger();
|
||||
|
|
@ -581,20 +588,23 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex {
|
|||
mFileBuffer.get(bytes);
|
||||
}
|
||||
|
||||
private int readInteger() {
|
||||
protected int readInteger() {
|
||||
return mFileBuffer.getInt();
|
||||
}
|
||||
|
||||
private long readLong() {
|
||||
protected long readLong() {
|
||||
return mFileBuffer.getLong();
|
||||
}
|
||||
|
||||
private void skipBytes(final int count) {
|
||||
protected void skipBytes(final int count) {
|
||||
mFileBuffer.position(mFileBuffer.position() + count);
|
||||
}
|
||||
|
||||
private void seek(final int position) {
|
||||
protected void seek(final int position) {
|
||||
mFileBuffer.position(position);
|
||||
}
|
||||
|
||||
protected long position() {
|
||||
return mFileBuffer.position();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -158,8 +158,8 @@ public class IntervalSharder {
|
|||
if(!binIterator.hasNext())
|
||||
break;
|
||||
|
||||
int locationStart = (int)location.getStart();
|
||||
final int locationStop = (int)location.getStop();
|
||||
int locationStart = location.getStart();
|
||||
final int locationStop = location.getStop();
|
||||
|
||||
// Advance to first bin.
|
||||
while(binIterator.peek().stop < locationStart)
|
||||
|
|
|
|||
|
|
@ -83,7 +83,10 @@ public class LocusShardStrategy implements ShardStrategy {
|
|||
else
|
||||
intervals = locations;
|
||||
|
||||
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
|
||||
if(SAMDataSource.TRY_LOW_MEMORY_SHARDING)
|
||||
this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
||||
else
|
||||
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
|
||||
}
|
||||
else {
|
||||
final int maxShardSize = 100000;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKBin;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once.
|
||||
*/
|
||||
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
||||
private static Logger logger = Logger.getLogger(IntervalSharder.class);
|
||||
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
private final GenomeLocSortedSet loci;
|
||||
|
||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
||||
|
||||
private GenomeLoc currentLocus;
|
||||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextFilePointer != null;
|
||||
}
|
||||
|
||||
public FilePointer next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next element available in interval sharder");
|
||||
FilePointer currentFilePointer = nextFilePointer;
|
||||
advance();
|
||||
return currentFilePointer;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
if(loci.isEmpty())
|
||||
return;
|
||||
|
||||
nextFilePointer = null;
|
||||
while(nextFilePointer == null && currentLocus != null) {
|
||||
nextFilePointer = new FilePointer(currentLocus.getContig());
|
||||
|
||||
int coveredRegionStart = 1;
|
||||
int coveredRegionStop = Integer.MAX_VALUE;
|
||||
GenomeLoc coveredRegion = null;
|
||||
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs()) {
|
||||
GATKBAMIndex index = (GATKBAMIndex)dataSource.getIndex(reader);
|
||||
BinTree binTree = getNextOverlappingBinTree((GATKBAMIndex)dataSource.getIndex(reader),currentLocus);
|
||||
if(binTree != null) {
|
||||
coveredRegionStart = Math.max(coveredRegionStart,index.getFirstLocusInBin(binTree.getLowestLevelBin().toBin()));
|
||||
coveredRegionStop = Math.min(coveredRegionStop,index.getLastLocusInBin(binTree.getLowestLevelBin().toBin()));
|
||||
coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop);
|
||||
|
||||
GATKBAMFileSpan fileSpan = generateFileSpan(index,binTree,currentLocus);
|
||||
nextFilePointer.addFileSpans(reader,fileSpan);
|
||||
}
|
||||
}
|
||||
|
||||
// Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list.
|
||||
GenomeLoc initialLocation = currentLocus.intersect(coveredRegion);
|
||||
nextFilePointer.addLocation(initialLocation);
|
||||
|
||||
// See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval.
|
||||
if(!nextFilePointer.locations.isEmpty()) {
|
||||
while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) {
|
||||
currentLocus = locusIterator.next();
|
||||
nextFilePointer.addLocation(locusIterator.next());
|
||||
}
|
||||
|
||||
// Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus,
|
||||
// we can simplify the interval creation process to the end of the covered region to the stop of the given interval.
|
||||
if(coveredRegionStop < currentLocus.getStop())
|
||||
currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop());
|
||||
else if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
else
|
||||
currentLocus = null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The last reference sequence processed by this iterator.
|
||||
*/
|
||||
private int lastReferenceSequenceLoaded = -1;
|
||||
|
||||
/**
|
||||
* The stateful iterator used to progress through the genoem.
|
||||
*/
|
||||
private PeekableIterator<BinTree> binTreeIterator = null;
|
||||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param index BAM index representation.
|
||||
* @param locus Locus for which to grab the bin tree, if available.
|
||||
* @return The BinTree overlapping the given locus.
|
||||
*/
|
||||
private BinTree getNextOverlappingBinTree(final GATKBAMIndex index, final GenomeLoc locus) {
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(locus.getContigIndex() != lastReferenceSequenceLoaded) {
|
||||
if(binTreeIterator != null)
|
||||
binTreeIterator.close();
|
||||
lastReferenceSequenceLoaded = locus.getContigIndex();
|
||||
binTreeIterator = new PeekableIterator<BinTree>(new BinTreeIterator(index,index.getIndexFile(),locus.getContigIndex()));
|
||||
}
|
||||
|
||||
if(!binTreeIterator.hasNext())
|
||||
return null;
|
||||
|
||||
BinTree binTree = binTreeIterator.peek();
|
||||
while(index.getLastLocusInBin(binTree.getLowestLevelBin().toBin()) < locus.getStart()) {
|
||||
binTreeIterator.next(); // Before the point of interest. Consume this one.
|
||||
binTree = binTreeIterator.peek();
|
||||
}
|
||||
|
||||
if(binTree.overlaps(locus)) {
|
||||
binTreeIterator.next();
|
||||
return binTree;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a bin list to a file span, trimmed based on the linear index and with overlapping regions removed.
|
||||
* @param index BAM index.
|
||||
* @param binTree Tree of data found to overlap the region. binTree.overlaps(initialRegion) must return true.
|
||||
* @param initialRegion The region to employ when trimming the linear index.
|
||||
* @return File span mapping to given region.
|
||||
*/
|
||||
private GATKBAMFileSpan generateFileSpan(final GATKBAMIndex index, final BinTree binTree, final GenomeLoc initialRegion) {
|
||||
List<GATKChunk> chunks = new ArrayList<GATKChunk>(binTree.size());
|
||||
for(GATKBin bin: binTree.getBins()) {
|
||||
if(bin == null)
|
||||
continue;
|
||||
chunks.addAll(Arrays.asList(bin.getChunkList()));
|
||||
}
|
||||
|
||||
// Optimize the chunk list with a linear index optimization
|
||||
chunks = index.optimizeChunkList(chunks,index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart()));
|
||||
|
||||
return new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()]));
|
||||
}
|
||||
}
|
||||
|
|
@ -46,6 +46,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -79,6 +81,11 @@ public class SAMDataSource {
|
|||
*/
|
||||
private final SAMFileReader.ValidationStringency validationStringency;
|
||||
|
||||
/**
|
||||
* Store BAM indices for each reader present.
|
||||
*/
|
||||
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
||||
|
||||
/**
|
||||
* How far along is each reader?
|
||||
*/
|
||||
|
|
@ -121,6 +128,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
private final SAMResourcePool resourcePool;
|
||||
|
||||
static final boolean TRY_LOW_MEMORY_SHARDING = false;
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
* @param samFiles list of reads files.
|
||||
|
|
@ -272,6 +281,17 @@ public class SAMDataSource {
|
|||
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
||||
}
|
||||
|
||||
if(TRY_LOW_MEMORY_SHARDING) {
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null) {
|
||||
SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary();
|
||||
GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary);
|
||||
bamIndices.put(id,index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resourcePool.releaseReaders(readers);
|
||||
}
|
||||
|
||||
|
|
@ -366,14 +386,18 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* True if all readers have an index.
|
||||
* @return
|
||||
* @return True if all readers have an index.
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
|
||||
if(!reader.hasIndex())
|
||||
return false;
|
||||
if(TRY_LOW_MEMORY_SHARDING)
|
||||
return readerIDs.size() == bamIndices.size();
|
||||
else {
|
||||
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
|
||||
if(!reader.hasIndex())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -382,8 +406,12 @@ public class SAMDataSource {
|
|||
* @return The index. Will preload the index if necessary.
|
||||
*/
|
||||
public BrowseableBAMIndex getIndex(final SAMReaderID id) {
|
||||
SAMReaders readers = resourcePool.getReadersWithoutLocking();
|
||||
return readers.getReader(id).getBrowseableIndex();
|
||||
if(TRY_LOW_MEMORY_SHARDING)
|
||||
return bamIndices.get(id);
|
||||
else {
|
||||
SAMReaders readers = resourcePool.getReadersWithoutLocking();
|
||||
return readers.getReader(id).getBrowseableIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -701,11 +729,10 @@ public class SAMDataSource {
|
|||
for(SAMReaderID readerID: readerIDs) {
|
||||
SAMFileReader reader = new SAMFileReader(readerID.samFile);
|
||||
reader.enableFileSource(true);
|
||||
reader.enableIndexCaching(true);
|
||||
if(!TRY_LOW_MEMORY_SHARDING)
|
||||
reader.enableIndexCaching(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
|
||||
// If no read group is present, hallucinate one.
|
||||
// TODO: Straw poll to see whether this is really required.
|
||||
final SAMFileHeader header = reader.getFileHeader();
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
|
|
@ -944,6 +971,37 @@ public class SAMDataSource {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates the index file alongside the given BAM, if present.
|
||||
* TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent.
|
||||
* @param bamFile The data file to use.
|
||||
* @return A File object if the index file is present; null otherwise.
|
||||
*/
|
||||
private File findIndexFile(File bamFile) {
|
||||
File indexFile;
|
||||
|
||||
try {
|
||||
Class bamFileReaderClass = Class.forName("net.sf.samtools.BAMFileReader");
|
||||
Method indexFileLocator = bamFileReaderClass.getDeclaredMethod("findIndexFile",File.class);
|
||||
indexFileLocator.setAccessible(true);
|
||||
indexFile = (File)indexFileLocator.invoke(null,bamFile);
|
||||
}
|
||||
catch(ClassNotFoundException ex) {
|
||||
throw new ReviewedStingException("Unable to locate BAMFileReader class, used to check for index files");
|
||||
}
|
||||
catch(NoSuchMethodException ex) {
|
||||
throw new ReviewedStingException("Unable to locate Picard index file locator.");
|
||||
}
|
||||
catch(IllegalAccessException ex) {
|
||||
throw new ReviewedStingException("Unable to access Picard index file locator.");
|
||||
}
|
||||
catch(InvocationTargetException ex) {
|
||||
throw new ReviewedStingException("Unable to invoke Picard index file locator.");
|
||||
}
|
||||
|
||||
return indexFile;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Basic suite for testing idealized and actual performance of read processing.
|
||||
*/
|
||||
public class BAMProcessingPerformanceMeter extends CommandLineProgram {
|
||||
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true)
|
||||
File samFile;
|
||||
|
||||
@Input(fullName = "reference_file", shortName="R", doc = "Associated FASTA sequence", required = true)
|
||||
File referenceFile;
|
||||
|
||||
@Argument(fullName="test_repetitions", shortName = "test_reps", doc="Number of times to repeat each test", required = false)
|
||||
int testRepetitions = 5;
|
||||
|
||||
@Argument(fullName="print_frequency", shortName = "pf", doc="Print cumulative time after x # reads", required = false)
|
||||
int printFrequency = 100000;
|
||||
|
||||
private void testBAMFileProcessingThroughput(ReadProcessor readProcessor) {
|
||||
readProcessor.execute(samFile,referenceFile);
|
||||
}
|
||||
|
||||
public int execute() {
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new NoAdditionalProcessing(this));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverEachBase(this));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverCigarString(this));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new ExtractTag(this,"OQ"));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeSamLocusIterator(this));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, GATKArgumentCollection.getDefaultDownsamplingMethod()));
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, DownsamplingMethod.NONE));
|
||||
GATKWalkerInvoker countReadsInvoker = new GATKWalkerInvoker(this);
|
||||
CountReadsPerformanceWalker countReadsWalker = new CountReadsPerformanceWalker(countReadsInvoker);
|
||||
countReadsInvoker.setWalker(countReadsWalker);
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countReadsInvoker);
|
||||
|
||||
GATKWalkerInvoker countBasesInReadInvoker = new GATKWalkerInvoker(this);
|
||||
CountBasesInReadPerformanceWalker countBasesInReadWalker = new CountBasesInReadPerformanceWalker(countBasesInReadInvoker);
|
||||
countBasesInReadInvoker.setWalker(countBasesInReadWalker);
|
||||
for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countBasesInReadInvoker);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Required main method implementation.
|
||||
* @param argv Command-line argument text.
|
||||
* @throws Exception on error.
|
||||
*/
|
||||
public static void main(String[] argv) throws Exception {
|
||||
int returnCode = 0;
|
||||
try {
|
||||
BAMProcessingPerformanceMeter instance = new BAMProcessingPerformanceMeter();
|
||||
start(instance, argv);
|
||||
returnCode = 0;
|
||||
}
|
||||
catch(Exception ex) {
|
||||
returnCode = 1;
|
||||
ex.printStackTrace();
|
||||
throw ex;
|
||||
}
|
||||
finally {
|
||||
System.exit(returnCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
abstract class ReadProcessor {
|
||||
private final SimpleTimer timer;
|
||||
private final int printFrequency;
|
||||
protected int iterations = 0;
|
||||
|
||||
public ReadProcessor(BAMProcessingPerformanceMeter performanceMeter) {
|
||||
timer = new SimpleTimer("timer");
|
||||
this.printFrequency = performanceMeter.printFrequency;
|
||||
}
|
||||
|
||||
public abstract String getTestName();
|
||||
public String getIterationType() { return "loci"; }
|
||||
|
||||
public void processRead(final SAMRecord read) { }
|
||||
public void execute(File bamFile,File fastaFile) {
|
||||
SAMFileReader reader = new SAMFileReader(bamFile);
|
||||
startTest();
|
||||
for(SAMRecord read: reader) {
|
||||
processRead(read);
|
||||
updateIterationCount();
|
||||
}
|
||||
stopTest();
|
||||
reader.close();
|
||||
}
|
||||
|
||||
protected void startTest() {
|
||||
timer.start();
|
||||
}
|
||||
|
||||
protected void stopTest() {
|
||||
timer.stop();
|
||||
printStatus("TEST COMPLETE");
|
||||
}
|
||||
|
||||
protected void updateIterationCount() {
|
||||
if(++iterations % printFrequency == 0) printStatus("ONGOING");
|
||||
}
|
||||
|
||||
private void printStatus(String prefix) {
|
||||
System.out.printf("%s: %s printed %d %s in %f seconds.%n",prefix,getTestName(),iterations,getIterationType(),timer.getElapsedTime());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:55 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class CountBasesInReadPerformanceWalker extends ReadWalker<Integer,Long> {
|
||||
private long As;
|
||||
private long Cs;
|
||||
private long Gs;
|
||||
private long Ts;
|
||||
|
||||
private final GATKWalkerInvoker invoker;
|
||||
|
||||
public CountBasesInReadPerformanceWalker(GATKWalkerInvoker walkerInvoker) {
|
||||
this.invoker = walkerInvoker;
|
||||
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
for(byte base: read.getReadBases()) {
|
||||
switch(base) {
|
||||
case 'A': As++; break;
|
||||
case 'C': Cs++; break;
|
||||
case 'G': Gs++; break;
|
||||
case 'T': Ts++; break;
|
||||
}
|
||||
}
|
||||
invoker.updateIterationCount();
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Long reduceInit() { return 0L; }
|
||||
public Long reduce(Integer value, Long accum) { return value + accum; }
|
||||
}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:55 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class CountReadsPerformanceWalker extends ReadWalker<Integer,Long> {
|
||||
private final GATKWalkerInvoker invoker;
|
||||
|
||||
public CountReadsPerformanceWalker(GATKWalkerInvoker walkerInvoker) {
|
||||
this.invoker = walkerInvoker;
|
||||
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
invoker.updateIterationCount();
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Long reduceInit() { return 0L; }
|
||||
public Long reduce(Integer value, Long accum) { return value + accum; }
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:53 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class ExtractTag extends ReadProcessor {
|
||||
private final String tag;
|
||||
|
||||
public ExtractTag(final BAMProcessingPerformanceMeter performanceMeter, final String tag) {
|
||||
super(performanceMeter);
|
||||
this.tag = tag;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() { return "extract tag"; }
|
||||
public void processRead(final SAMRecord read) {
|
||||
read.getAttribute(tag);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:54 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class GATKWalkerInvoker extends ReadProcessor {
|
||||
/**
|
||||
* Walker to run over the existing dataset.
|
||||
*/
|
||||
private Walker<?,?> walker;
|
||||
|
||||
public GATKWalkerInvoker(BAMProcessingPerformanceMeter performanceMeter) {
|
||||
super(performanceMeter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() { return "GATK-CountReads"; }
|
||||
|
||||
public void setWalker(Walker<?,?> walker) {
|
||||
this.walker = walker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(File samFile, File fastaFile) {
|
||||
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
|
||||
// Establish the argument collection
|
||||
GATKArgumentCollection argCollection = new GATKArgumentCollection();
|
||||
argCollection.referenceFile = fastaFile;
|
||||
argCollection.samFiles = Collections.singletonList(samFile.getAbsolutePath());
|
||||
|
||||
engine.setArguments(argCollection);
|
||||
// Bugs in the engine mean that this has to be set twice.
|
||||
engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(samFile,new Tags())));
|
||||
engine.setFilters(Collections.<SamRecordFilter>emptyList());
|
||||
engine.setReferenceMetaDataFiles(Collections.<RMDTriplet>emptyList());
|
||||
|
||||
// Create the walker
|
||||
engine.setWalker(walker);
|
||||
|
||||
startTest();
|
||||
engine.execute();
|
||||
stopTest();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:54 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class InvokeLocusIteratorByState extends ReadProcessor {
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
|
||||
public InvokeLocusIteratorByState(final BAMProcessingPerformanceMeter performanceMeter,DownsamplingMethod downsamplingMethod) {
|
||||
super(performanceMeter);
|
||||
this.downsamplingMethod = downsamplingMethod;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() {
|
||||
if(downsamplingMethod != DownsamplingMethod.NONE)
|
||||
return String.format("invoke locus iterator by state; downsampling by sample to coverage = %d; ",downsamplingMethod.toCoverage);
|
||||
else
|
||||
return String.format("invoke locus iterator by state; no downsampling; ");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getIterationType() { return "loci"; }
|
||||
|
||||
@Override
|
||||
public void execute(File samFile, File fastaFile) {
|
||||
SAMFileReader reader = new SAMFileReader(samFile);
|
||||
ReadProperties readProperties = new ReadProperties(Collections.<SAMReaderID>singletonList(new SAMReaderID(samFile,new Tags())),
|
||||
reader.getFileHeader(),
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.SILENT,
|
||||
0,
|
||||
downsamplingMethod,
|
||||
new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)),
|
||||
Collections.<SamRecordFilter>emptyList(),
|
||||
false,
|
||||
false,
|
||||
BAQ.CalculationMode.OFF,
|
||||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null,
|
||||
(byte)0);
|
||||
|
||||
GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary());
|
||||
SampleDataSource sampleDataSource = new SampleDataSource();
|
||||
sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader());
|
||||
|
||||
// Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out?
|
||||
Iterator<SAMRecord> readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter());
|
||||
LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource);
|
||||
startTest();
|
||||
while(locusIteratorByState.hasNext()) {
|
||||
locusIteratorByState.next();
|
||||
updateIterationCount();
|
||||
}
|
||||
stopTest();
|
||||
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.picard.util.SamLocusIterator;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:54 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class InvokeSamLocusIterator extends ReadProcessor {
|
||||
public InvokeSamLocusIterator(final BAMProcessingPerformanceMeter performanceMeter) {
|
||||
super(performanceMeter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() {
|
||||
return String.format("invoke sam locus iterator");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getIterationType() { return "loci"; }
|
||||
|
||||
@Override
|
||||
public void execute(File samFile, File fastaFile) {
|
||||
SAMFileReader reader = new SAMFileReader(samFile);
|
||||
|
||||
SamLocusIterator samLocusIterator = new SamLocusIterator(reader);
|
||||
samLocusIterator.setEmitUncoveredLoci(false);
|
||||
Iterator<SamLocusIterator.LocusInfo> workhorseIterator = samLocusIterator.iterator();
|
||||
|
||||
startTest();
|
||||
while(workhorseIterator.hasNext()) {
|
||||
SamLocusIterator.LocusInfo locusInfo = workhorseIterator.next();
|
||||
updateIterationCount();
|
||||
}
|
||||
stopTest();
|
||||
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:53 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class IterateOverCigarString extends ReadProcessor {
|
||||
private long matchMismatches;
|
||||
private long insertions;
|
||||
private long deletions;
|
||||
private long others;
|
||||
|
||||
public IterateOverCigarString(final BAMProcessingPerformanceMeter performanceMeter) {
|
||||
super(performanceMeter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() { return "iterator over cigar string"; }
|
||||
public void processRead(final SAMRecord read) {
|
||||
Cigar cigar = read.getCigar();
|
||||
for(CigarElement cigarElement: cigar.getCigarElements()) {
|
||||
int elementSize = cigarElement.getLength();
|
||||
while(elementSize > 0) {
|
||||
switch(cigarElement.getOperator()) {
|
||||
case M: matchMismatches++; break;
|
||||
case I: insertions++; break;
|
||||
case D: deletions++; break;
|
||||
default: others++; break;
|
||||
}
|
||||
elementSize--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:53 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class IterateOverEachBase extends ReadProcessor {
|
||||
private long As;
|
||||
private long Cs;
|
||||
private long Gs;
|
||||
private long Ts;
|
||||
|
||||
public IterateOverEachBase(final BAMProcessingPerformanceMeter performanceMeter) {
|
||||
super(performanceMeter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() { return "iterate over each base"; }
|
||||
public void processRead(final SAMRecord read) {
|
||||
for(byte base: read.getReadBases()) {
|
||||
switch(base) {
|
||||
case 'A': As++; break;
|
||||
case 'C': Cs++; break;
|
||||
case 'G': Gs++; break;
|
||||
case 'T': Ts++; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.performance;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 25, 2011
|
||||
* Time: 10:16:53 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class NoAdditionalProcessing extends ReadProcessor {
|
||||
public NoAdditionalProcessing(final BAMProcessingPerformanceMeter performanceMeter) {
|
||||
super(performanceMeter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTestName() { return "no additional processing"; }
|
||||
public void processRead(final SAMRecord read) {}
|
||||
}
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads.utilities;
|
||||
|
||||
import net.sf.samtools.util.BlockGunzipper;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
/**
|
||||
* Test decompression of a single BGZF block.
|
||||
*/
|
||||
public class UnzipSingleBlock extends CommandLineProgram {
|
||||
@Input(fullName = "block_file", shortName = "b", doc = "block file over which to test unzipping", required = true)
|
||||
private File blockFile;
|
||||
|
||||
@Input(fullName = "compressed_block_size", shortName = "cbs", doc = "size of compressed block", required = true)
|
||||
private int compressedBufferSize;
|
||||
|
||||
public int execute() throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException {
|
||||
byte[] compressedBuffer = new byte[(int)blockFile.length()];
|
||||
byte[] uncompressedBuffer = new byte[65536];
|
||||
|
||||
FileInputStream fis = new FileInputStream(blockFile);
|
||||
fis.read(compressedBuffer);
|
||||
fis.close();
|
||||
|
||||
BlockGunzipper gunzipper = new BlockGunzipper();
|
||||
gunzipper.setCheckCrcs(true);
|
||||
Method unzipBlock = BlockGunzipper.class.getDeclaredMethod("unzipBlock",byte[].class,byte[].class,Integer.TYPE);
|
||||
unzipBlock.setAccessible(true);
|
||||
|
||||
unzipBlock.invoke(gunzipper,uncompressedBuffer,compressedBuffer,compressedBufferSize);
|
||||
|
||||
System.out.printf("SUCCESS!%n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Required main method implementation.
|
||||
* @param argv Command-line argument text.
|
||||
* @throws Exception on error.
|
||||
*/
|
||||
public static void main(String[] argv) throws Exception {
|
||||
int returnCode = 0;
|
||||
try {
|
||||
UnzipSingleBlock instance = new UnzipSingleBlock();
|
||||
start(instance, argv);
|
||||
returnCode = 0;
|
||||
}
|
||||
catch(Exception ex) {
|
||||
returnCode = 1;
|
||||
ex.printStackTrace();
|
||||
throw ex;
|
||||
}
|
||||
finally {
|
||||
System.exit(returnCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -88,7 +88,7 @@ public class GATKBAMIndexUnitTest extends BaseTest {
|
|||
|
||||
// Level 5
|
||||
Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681);
|
||||
Assert.assertEquals(bamIndex.getLevelSize(5),37449-4681+1);
|
||||
Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue