From 600f73cbd6f5dc8c249b134fe1c4f8e81c93d963 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 25 Feb 2011 17:50:32 +0000 Subject: [PATCH] A checkpoint commit of two BAM reading projects going on simultaneously. These two projects are works in progress, and this checkin will provide a baseline against which to gauge improvements to both projects. Low-memory BAM protoshards (disabled by default): - Currently passing ValidatingPileupIntegrationTest. - Gets progressively slower throughout the traversal, but should run at least as fast as original implementation. - Uses 10+ file handles per BAM, but should use 3. BAM performance microbenchmark test system: - Currently tests performance of BAM reading using SAM-JDK vs. GATK - Tests do not hit all GATK performance hotspots. - New tests that require input data in a slightly different form are hard to implement. - Output of test results is not easily parseable (investigating Google Caliper for possible improvements). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5317 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/net/sf/samtools/GATKBAMFileSpan.java | 5 +- java/src/net/sf/samtools/GATKBin.java | 106 +++++-- .../arguments/GATKArgumentCollection.java | 2 +- .../reads/BAMIndexBinIterator.java | 266 ++++++++++++++++++ .../datasources/reads/BAMIndexContent.java | 2 +- .../sting/gatk/datasources/reads/BinTree.java | 198 +++++++++++++ .../gatk/datasources/reads/FilePointer.java | 4 + .../gatk/datasources/reads/GATKBAMIndex.java | 36 ++- .../datasources/reads/IntervalSharder.java | 4 +- .../datasources/reads/LocusShardStrategy.java | 5 +- .../reads/LowMemoryIntervalSharder.java | 193 +++++++++++++ .../gatk/datasources/reads/SAMDataSource.java | 78 ++++- .../BAMProcessingPerformanceMeter.java | 143 ++++++++++ .../CountBasesInReadPerformanceWalker.java | 67 +++++ .../CountReadsPerformanceWalker.java | 54 ++++ .../reads/performance/ExtractTag.java | 49 ++++ .../reads/performance/GATKWalkerInvoker.java | 85 ++++++ .../InvokeLocusIteratorByState.java | 106 +++++++ .../performance/InvokeSamLocusIterator.java | 70 +++++ .../performance/IterateOverCigarString.java | 65 +++++ .../performance/IterateOverEachBase.java | 58 ++++ .../performance/NoAdditionalProcessing.java | 44 +++ .../reads/utilities/UnzipSingleBlock.java | 88 ++++++ .../reads/GATKBAMIndexUnitTest.java | 2 +- 24 files changed, 1678 insertions(+), 52 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/BinTree.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/BAMProcessingPerformanceMeter.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountBasesInReadPerformanceWalker.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountReadsPerformanceWalker.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/ExtractTag.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/GATKWalkerInvoker.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeLocusIteratorByState.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeSamLocusIterator.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverCigarString.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverEachBase.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/NoAdditionalProcessing.java create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java diff --git a/java/src/net/sf/samtools/GATKBAMFileSpan.java b/java/src/net/sf/samtools/GATKBAMFileSpan.java index ba99f73e5..13c40c3a2 100644 --- a/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -25,6 +25,7 @@ package net.sf.samtools; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -53,8 +54,8 @@ public class GATKBAMFileSpan extends BAMFileSpan { * Create a new chunk list from the given list of chunks. * @param chunks Constituent chunks. */ - public GATKBAMFileSpan(final List chunks) { - super(new ArrayList(chunks)); + public GATKBAMFileSpan(final GATKChunk[] chunks) { + super(Arrays.asList(chunks)); } /** diff --git a/java/src/net/sf/samtools/GATKBin.java b/java/src/net/sf/samtools/GATKBin.java index 0922ef630..e2ac8bd96 100644 --- a/java/src/net/sf/samtools/GATKBin.java +++ b/java/src/net/sf/samtools/GATKBin.java @@ -33,38 +33,102 @@ import java.util.List; * override GATKBin and make it public. * TODO: Eliminate once we determine the final fate of the BAM index reading code. */ -public class GATKBin extends Bin { +public class GATKBin implements Comparable { + /** + * The reference sequence associated with this bin. + */ + private final int referenceSequence; + + /** + * The number of this bin within the BAM file. + */ + private final int binNumber; + + /** + * The chunks associated with this bin. + */ + private GATKChunk[] chunkList; + + public GATKBin(Bin bin) { + this(bin.getReferenceSequence(),bin.getBinNumber()); + } + public GATKBin(final int referenceSequence, final int binNumber) { - super(referenceSequence,binNumber); + this.referenceSequence = referenceSequence; + this.binNumber = binNumber; } - public GATKBin(final Bin bin) { - super(bin.getReferenceSequence(),bin.getBinNumber()); - } - - @Override public int getReferenceSequence() { - return super.getReferenceSequence(); + return referenceSequence; } - @Override public int getBinNumber() { - return super.getBinNumber(); + return binNumber; } - public List getGATKChunkList() { - List gatkChunks = new ArrayList(); - for(Chunk chunk: getChunkList()) - gatkChunks.add(new GATKChunk(chunk)); - return gatkChunks; - } - - public void setGATKChunkList(List chunks) { - super.setChunkList(new ArrayList(chunks)); + /** + * Convert this GATKBin to a normal bin, for processing with the standard BAM query interface. + * @return + */ + public Bin toBin() { + return new Bin(referenceSequence,binNumber); } + /** + * See whether two bins are equal. If the ref seq and the bin number + * are equal, assume equality of the chunk list. + * @param other The other Bin to which to compare this. + * @return True if the two bins are equal. False otherwise. + */ @Override - public String toString() { - return String.format("Bin %d in contig %d",getBinNumber(),getReferenceSequence()); + public boolean equals(Object other) { + if(other == null) return false; + if(!(other instanceof GATKBin)) return false; + + GATKBin otherBin = (GATKBin)other; + return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber; + } + + /** + * Compute a unique hash code for the given reference sequence and bin number. + * @return A unique hash code. + */ + @Override + public int hashCode() { + return ((Integer)referenceSequence).hashCode() ^ ((Integer)binNumber).hashCode(); + } + + /** + * Compare two bins to see what ordering they should appear in. + * @param other Other bin to which this bin should be compared. + * @return -1 if this < other, 0 if this == other, 1 if this > other. + */ + public int compareTo(GATKBin other) { + if(other == null) + throw new ClassCastException("Cannot compare to a null object"); + + // Check the reference sequences first. + if(this.referenceSequence != other.referenceSequence) + return referenceSequence - other.referenceSequence; + + // Then check the bin ordering. + return binNumber - other.binNumber; + } + + /** + * Sets the chunks associated with this bin + */ + public void setChunkList(GATKChunk[] list){ + chunkList = list; + } + + /** + * Gets the list of chunks associated with this bin. + * @return the chunks in this bin. If no chunks are associated, an empty list will be returned. + */ + public GATKChunk[] getChunkList(){ + if(chunkList == null) + return new GATKChunk[0]; + return chunkList; } } diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 4d00262d3..2eff6dd11 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -179,7 +179,7 @@ public class GATKArgumentCollection { * method. * @return The default downsampling mechanism, or null if none exists. */ - public DownsamplingMethod getDefaultDownsamplingMethod() { + public static DownsamplingMethod getDefaultDownsamplingMethod() { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java new file mode 100644 index 000000000..ef22f9b43 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBin; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Creates an 'index of the index' for a particular reference sequence + * within the BAM file, for easier whole-BAM-file traversal. + * file. + */ +public class BAMIndexBinIterator { + /** + * The source of index data. + */ + private final GATKBAMIndex index; + + /** + * The file storing the index data. + */ + private final File indexFile; + + /** + * File storing index metadata. + */ + private final File metaIndexFile; + + /** + * Reference sequence that temporary file is based on. + */ + private final int referenceSequence; + + /** + * Size of a long in bytes. + */ + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) { + this.index = index; + this.indexFile = indexFile; + this.referenceSequence = referenceSequence; + + index.seek(4); + + final int sequenceCount = index.readInteger(); + + if (referenceSequence >= sequenceCount) + throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount)); + + index.skipToSequence(referenceSequence); + + int binCount = index.readInteger(); + + try { + metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null); + metaIndexFile.deleteOnExit(); + + FileOutputStream metaIndex = new FileOutputStream(metaIndexFile); + FileChannel metaIndexChannel = metaIndex.getChannel(); + + // zero out the contents of the file. Arrays of primitives in java are always zeroed out by default. + byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*(Long.SIZE/8)]; // byte array is zeroed out by default. + metaIndexChannel.write(ByteBuffer.wrap(emptyContents)); + + ByteBuffer binPositionBuffer = ByteBuffer.allocate(emptyContents.length); + binPositionBuffer.order(ByteOrder.LITTLE_ENDIAN); + + for (int binNumber = 0; binNumber < binCount; binNumber++) { + long position = index.position(); + + final int indexBin = index.readInteger(); + + metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES); + binPositionBuffer.putLong(position); + binPositionBuffer.flip(); + + System.out.printf("Writing bin number %d to position %d: coordinate = %d%n",indexBin,indexBin*Long.SIZE*8,position); + + metaIndexChannel.write(binPositionBuffer); + binPositionBuffer.flip(); + + final int nChunks = index.readInteger(); + index.skipBytes(16 * nChunks); + } + + metaIndexChannel.close(); + metaIndex.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to write BAM metaindex",ex); + } + } + + public void close() { + metaIndexFile.delete(); + } + + public CloseableIterator getIteratorOverLevel(int level) { + return new LevelIterator(level); + } + + private class LevelIterator implements CloseableIterator { + /** + * The raw BAM index file with unordered bins. + */ + private final FileInputStream indexInputStream; + + /** + * The index metafile, with pointers to ordered bins. + */ + private final FileInputStream metaIndexInputStream; + + /** + * The first and last bins in the level. + */ + private final int firstBinNumber, lastBinNumber; + + /** + * The current bin in the index. + */ + private int currentBinNumber; + + /** + * Position of the most recent chunk data. + */ + private GATKBin nextBin = null; + + public LevelIterator(final int level) { + try { + indexInputStream = new FileInputStream(indexFile); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to open index file for reading"); + } + try { + metaIndexInputStream = new FileInputStream(metaIndexFile); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to open index metafile for reading"); + } + + firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level); + lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1; + + currentBinNumber = firstBinNumber - 1; + advance(); + } + + public void close() { + try { + indexInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to close index file."); + } + try { + metaIndexInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to close index metafile"); + } + } + + public boolean hasNext() { + return nextBin != null; + } + + public GATKBin next() { + if(!hasNext()) + throw new NoSuchElementException("Out of elements in BAMIndexBinIterator"); + GATKBin currentPosition = nextBin; + advance(); + return currentPosition; + } + + public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); } + + private void advance() { + ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2); + indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN); + + nextBin = null; + try { + indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES); + while(nextBin == null && currentBinNumber < lastBinNumber) { + currentBinNumber++; + metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES); + metaIndexInputStream.getChannel().read(indexFilePositionDecoder); + indexFilePositionDecoder.flip(); + long currentPosition = indexFilePositionDecoder.getLong(); + indexFilePositionDecoder.flip(); + + if(currentPosition != 0) { + indexInputStream.getChannel().position(currentPosition); + indexInputStream.getChannel().read(indexFilePositionDecoder); + + indexFilePositionDecoder.flip(); + int binNumber = indexFilePositionDecoder.getInt(); + if(binNumber != currentBinNumber) + throw new ReviewedStingException("Index file and metaindex file are out of sync."); + + int nChunks = indexFilePositionDecoder.getInt(); + GATKChunk[] chunks = new GATKChunk[nChunks]; + + indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2); + indexFilePositionDecoder.clear(); + + for (int ci = 0; ci < nChunks; ci++) { + indexInputStream.getChannel().read(indexFilePositionDecoder); + + indexFilePositionDecoder.flip(); + final long chunkBegin = indexFilePositionDecoder.getLong(); + final long chunkEnd = indexFilePositionDecoder.getLong(); + chunks[ci] = new GATKChunk(chunkBegin, chunkEnd); + + indexFilePositionDecoder.flip(); + } + + nextBin = new GATKBin(referenceSequence,binNumber); + nextBin.setChunkList(chunks); + } + } + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to close index metafile"); + } + + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java index 48194dafa..4d91fb45f 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexContent.java @@ -100,7 +100,7 @@ class BAMIndexContent { List allChunks = new ArrayList(); for (GATKBin b : mBinList) if (b.getChunkList() != null) { - allChunks.addAll(b.getGATKChunkList()); + allChunks.addAll(Arrays.asList(b.getChunkList())); } return Collections.unmodifiableList(allChunks); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BinTree.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BinTree.java new file mode 100644 index 000000000..95c33d552 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BinTree.java @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.Bin; +import net.sf.samtools.GATKBin; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Represents a tree of overlapping bins in a single + * BAM index. + */ +public class BinTree { + /** + * The BAM index from which this bin data is sourced. + */ + private final GATKBAMIndex index; + + /** + * The bins in this tree, organized by level. + */ + private final GATKBin[] bins; + + public BinTree(GATKBAMIndex index,final GATKBin[] bins) { + this.index = index; + this.bins = bins; + } + + public GATKBin getLowestLevelBin() { + for(int i = bins.length-1; i >= 0; i--) { + if(bins[i] != null) + return bins[i]; + } + return null; + } + + + /** + * Retrieve the bins from the bin tree. + * @return list of bins. + */ + public GATKBin[] getBins() { + return bins; + } + + /** + * Gets the number of bins in a given bin list. + * @return Number of bins in the list. + */ + public int size() { + return bins.length; + } + + /** + * Checks overlap between this bin tree and other bin trees. + * @param position the position over which to detect overlap. + * @return True if the segment overlaps. False otherwise. + */ + public boolean overlaps(final GenomeLoc position) { + for(GATKBin gatkBin: bins) { + if(gatkBin == null) + continue; + Bin bin = new Bin(gatkBin.getReferenceSequence(),gatkBin.getBinNumber()); + // Overlap occurs when the position is not disjoint with the bin boundaries. + if(!(position.getStop() < index.getFirstLocusInBin(bin) || position.getStart() > index.getLastLocusInBin(bin))) + return true; + } + return false; + } +} + +/** + * Iterate through all bin trees in sequence, from those covering base 1 to those covering MAX_BINS. + */ +class BinTreeIterator implements Iterator { + /** + * The index over which to iterate. + */ + private final GATKBAMIndex index; + + /** + * Iterators over each individual level. + */ + private final PeekableIterator[] levelIterators; + + /** + * The next bin tree to be returned. + */ + private BinTree nextBinTree; + + /** + * Each iteration through the bin tree has a corresponding lowest level. Make sure + * every lowest-level bin is covered, whether that bin is present or not. + */ + private int currentBinInLowestLevel; + + public BinTreeIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) { + this.index = index; + + BAMIndexBinIterator binIterator = new BAMIndexBinIterator(index,indexFile,referenceSequence); + levelIterators = new PeekableIterator[GATKBAMIndex.getNumIndexLevels()]; + for(int level = 0; level < GATKBAMIndex.getNumIndexLevels(); level++) + levelIterators[level] = new PeekableIterator(binIterator.getIteratorOverLevel(level)); + + // Set the current bin to one less that the first bin in the sequence. advance() will push it + // ahead to the first bin in the lowest level. + currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1) - 1; + + advance(); + } + + public boolean hasNext() { + return nextBinTree != null; + } + + /** + * Return the next BinTree in the level. + * @return Next BinTree in sequence. + */ + public BinTree next() { + if(!hasNext()) + throw new NoSuchElementException("BinTreeIterator is out of elements"); + BinTree currentBinTree = nextBinTree; + advance(); + return currentBinTree; + } + + /** + * Bring the bin tree ahead to the next overlapping structure. + */ + private void advance() { + final int lowestLevel = GATKBAMIndex.getNumIndexLevels()-1; + final int firstBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(lowestLevel); + final int binsInLowestLevel = index.getLevelSize(lowestLevel); + + currentBinInLowestLevel++; + + GATKBin[] bins = new GATKBin[GATKBAMIndex.getNumIndexLevels()]; + nextBinTree = null; + while(nextBinTree == null) { + for(int level = lowestLevel; level >= 0; level--) { + if(!levelIterators[level].hasNext()) + continue; + + final int firstBinInThisLevel = GATKBAMIndex.getFirstBinInLevel(level); + final int binsInThisLevel = index.getLevelSize(level); + final int currentBinInThisLevel = ((currentBinInLowestLevel-firstBinInLowestLevel)*binsInThisLevel/binsInLowestLevel) + firstBinInThisLevel; + + while(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() < currentBinInThisLevel) + levelIterators[level].next(); + + if(levelIterators[level].hasNext() && levelIterators[level].peek().getBinNumber() == currentBinInThisLevel) + bins[level] = levelIterators[level].peek(); + } + + for(int level = 0; level <= lowestLevel; level++) { + if(bins[level] != null) { + nextBinTree = new BinTree(index,bins); + break; + } + } + } + } + + /** + * Remove unsupported. + */ + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from a BinTreeIterator"); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index 69128b272..26c547a27 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -61,6 +61,10 @@ class FilePointer { this.isRegionUnmapped = false; } + public FilePointer(final String referenceSequence) { + this(referenceSequence,null); + } + public void addLocation(GenomeLoc location) { locations.add(location); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 782fa3f88..b3c767cec 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -103,6 +103,14 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { } } + /** + * Gets the file backing this index. + * @return The index file. + */ + public File getIndexFile() { + return mFile; + } + /** * Get the number of levels employed by this index. * @return Number of levels in this index. @@ -127,7 +135,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { */ public int getLevelSize(final int levelNumber) { if(levelNumber == getNumIndexLevels()-1) - return MAX_BINS-LEVEL_STARTS[levelNumber]; + return MAX_BINS-LEVEL_STARTS[levelNumber]-1; else return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; } @@ -272,7 +280,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { List chunkList = new ArrayList(); for(GATKBin bin: bins) { - for(GATKChunk chunk: bin.getGATKChunkList()) + for(GATKChunk chunk: bin.getChunkList()) chunkList.add(chunk.clone()); } @@ -281,7 +289,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { } chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos)); - return new GATKBAMFileSpan(chunkList); + return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); } /** @@ -322,13 +330,13 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { List chunkList = new ArrayList(); for(GATKBin coveringBin: binTree) { - for(GATKChunk chunk: coveringBin.getGATKChunkList()) + for(GATKChunk chunk: coveringBin.getChunkList()) chunkList.add(chunk.clone()); } final int start = getFirstLocusInBin(bin); chunkList = optimizeChunkList(chunkList,indexQuery.getLinearIndex().getMinimumOffset(start)); - return new GATKBAMFileSpan(chunkList); + return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); } public GATKBAMFileSpan getContentsOfBin(final Bin bin) { @@ -344,7 +352,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber()); - return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getGATKChunkList()) : null; + return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null; } /** @@ -420,8 +428,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { skipBytes(16 * nChunks); } GATKBin bin = new GATKBin(referenceSequence, indexBin); - bin.setGATKChunkList(chunks); - bin.setLastChunk(lastChunk); + bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); bins[indexBin] = bin; } @@ -560,7 +567,7 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { } } - private void skipToSequence(final int sequenceIndex) { + protected void skipToSequence(final int sequenceIndex) { for (int i = 0; i < sequenceIndex; i++) { // System.out.println("# Sequence TID: " + i); final int nBins = readInteger(); @@ -581,20 +588,23 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { mFileBuffer.get(bytes); } - private int readInteger() { + protected int readInteger() { return mFileBuffer.getInt(); } - private long readLong() { + protected long readLong() { return mFileBuffer.getLong(); } - private void skipBytes(final int count) { + protected void skipBytes(final int count) { mFileBuffer.position(mFileBuffer.position() + count); } - private void seek(final int position) { + protected void seek(final int position) { mFileBuffer.position(position); } + protected long position() { + return mFileBuffer.position(); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 0732b0de9..f2110fae0 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -158,8 +158,8 @@ public class IntervalSharder { if(!binIterator.hasNext()) break; - int locationStart = (int)location.getStart(); - final int locationStop = (int)location.getStop(); + int locationStart = location.getStart(); + final int locationStop = location.getStop(); // Advance to first bin. while(binIterator.peek().stop < locationStart) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java index 1309c0c18..d40ee74c3 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardStrategy.java @@ -83,7 +83,10 @@ public class LocusShardStrategy implements ShardStrategy { else intervals = locations; - this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); + if(SAMDataSource.TRY_LOW_MEMORY_SHARDING) + this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals); + else + this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals); } else { final int maxShardSize = 100000; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java new file mode 100644 index 000000000..6f0d52fd6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKBin; +import net.sf.samtools.GATKChunk; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. + */ +public class LowMemoryIntervalSharder implements Iterator { + private static Logger logger = Logger.getLogger(IntervalSharder.class); + + private final SAMDataSource dataSource; + + private final GenomeLocSortedSet loci; + + private final PeekableIterator locusIterator; + + private GenomeLoc currentLocus; + + private FilePointer nextFilePointer = null; + + public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + this.dataSource = dataSource; + this.loci = loci; + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + + public boolean hasNext() { + return nextFilePointer != null; + } + + public FilePointer next() { + if(!hasNext()) + throw new NoSuchElementException("No next element available in interval sharder"); + FilePointer currentFilePointer = nextFilePointer; + advance(); + return currentFilePointer; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); + } + + private void advance() { + if(loci.isEmpty()) + return; + + nextFilePointer = null; + while(nextFilePointer == null && currentLocus != null) { + nextFilePointer = new FilePointer(currentLocus.getContig()); + + int coveredRegionStart = 1; + int coveredRegionStop = Integer.MAX_VALUE; + GenomeLoc coveredRegion = null; + + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = (GATKBAMIndex)dataSource.getIndex(reader); + BinTree binTree = getNextOverlappingBinTree((GATKBAMIndex)dataSource.getIndex(reader),currentLocus); + if(binTree != null) { + coveredRegionStart = Math.max(coveredRegionStart,index.getFirstLocusInBin(binTree.getLowestLevelBin().toBin())); + coveredRegionStop = Math.min(coveredRegionStop,index.getLastLocusInBin(binTree.getLowestLevelBin().toBin())); + coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); + + GATKBAMFileSpan fileSpan = generateFileSpan(index,binTree,currentLocus); + nextFilePointer.addFileSpans(reader,fileSpan); + } + } + + // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. + GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); + nextFilePointer.addLocation(initialLocation); + + // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. + if(!nextFilePointer.locations.isEmpty()) { + while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { + currentLocus = locusIterator.next(); + nextFilePointer.addLocation(locusIterator.next()); + } + + // Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus, + // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. + if(coveredRegionStop < currentLocus.getStop()) + currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); + else if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + else + currentLocus = null; + } + + } + } + + /** + * The last reference sequence processed by this iterator. + */ + private int lastReferenceSequenceLoaded = -1; + + /** + * The stateful iterator used to progress through the genoem. + */ + private PeekableIterator binTreeIterator = null; + + /** + * Get the next overlapping tree of bins associated with the given BAM file. + * @param index BAM index representation. + * @param locus Locus for which to grab the bin tree, if available. + * @return The BinTree overlapping the given locus. + */ + private BinTree getNextOverlappingBinTree(final GATKBAMIndex index, final GenomeLoc locus) { + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. + if(locus.getContigIndex() != lastReferenceSequenceLoaded) { + if(binTreeIterator != null) + binTreeIterator.close(); + lastReferenceSequenceLoaded = locus.getContigIndex(); + binTreeIterator = new PeekableIterator(new BinTreeIterator(index,index.getIndexFile(),locus.getContigIndex())); + } + + if(!binTreeIterator.hasNext()) + return null; + + BinTree binTree = binTreeIterator.peek(); + while(index.getLastLocusInBin(binTree.getLowestLevelBin().toBin()) < locus.getStart()) { + binTreeIterator.next(); // Before the point of interest. Consume this one. + binTree = binTreeIterator.peek(); + } + + if(binTree.overlaps(locus)) { + binTreeIterator.next(); + return binTree; + } + + return null; + } + + /** + * Converts a bin list to a file span, trimmed based on the linear index and with overlapping regions removed. + * @param index BAM index. + * @param binTree Tree of data found to overlap the region. binTree.overlaps(initialRegion) must return true. + * @param initialRegion The region to employ when trimming the linear index. + * @return File span mapping to given region. + */ + private GATKBAMFileSpan generateFileSpan(final GATKBAMIndex index, final BinTree binTree, final GenomeLoc initialRegion) { + List chunks = new ArrayList(binTree.size()); + for(GATKBin bin: binTree.getBins()) { + if(bin == null) + continue; + chunks.addAll(Arrays.asList(bin.getChunkList())); + } + + // Optimize the chunk list with a linear index optimization + chunks = index.optimizeChunkList(chunks,index.getLinearIndex(initialRegion.getContigIndex()).getMinimumOffset(initialRegion.getStart())); + + return new GATKBAMFileSpan(chunks.toArray(new GATKChunk[chunks.size()])); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7d43cb3a4..5877f67bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -46,6 +46,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.*; /** @@ -79,6 +81,11 @@ public class SAMDataSource { */ private final SAMFileReader.ValidationStringency validationStringency; + /** + * Store BAM indices for each reader present. + */ + private final Map bamIndices = new HashMap(); + /** * How far along is each reader? */ @@ -121,6 +128,8 @@ public class SAMDataSource { */ private final SAMResourcePool resourcePool; + static final boolean TRY_LOW_MEMORY_SHARDING = false; + /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. @@ -272,6 +281,17 @@ public class SAMDataSource { originalToMergedReadGroupMappings.put(id,mappingToMerged); } + if(TRY_LOW_MEMORY_SHARDING) { + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.samFile); + if(indexFile != null) { + SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary(); + GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary); + bamIndices.put(id,index); + } + } + } + resourcePool.releaseReaders(readers); } @@ -366,14 +386,18 @@ public class SAMDataSource { /** * True if all readers have an index. - * @return + * @return True if all readers have an index. */ public boolean hasIndex() { - for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { - if(!reader.hasIndex()) - return false; + if(TRY_LOW_MEMORY_SHARDING) + return readerIDs.size() == bamIndices.size(); + else { + for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { + if(!reader.hasIndex()) + return false; + } + return true; } - return true; } /** @@ -382,8 +406,12 @@ public class SAMDataSource { * @return The index. Will preload the index if necessary. */ public BrowseableBAMIndex getIndex(final SAMReaderID id) { - SAMReaders readers = resourcePool.getReadersWithoutLocking(); - return readers.getReader(id).getBrowseableIndex(); + if(TRY_LOW_MEMORY_SHARDING) + return bamIndices.get(id); + else { + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + return readers.getReader(id).getBrowseableIndex(); + } } /** @@ -701,11 +729,10 @@ public class SAMDataSource { for(SAMReaderID readerID: readerIDs) { SAMFileReader reader = new SAMFileReader(readerID.samFile); reader.enableFileSource(true); - reader.enableIndexCaching(true); + if(!TRY_LOW_MEMORY_SHARDING) + reader.enableIndexCaching(true); reader.setValidationStringency(validationStringency); - // If no read group is present, hallucinate one. - // TODO: Straw poll to see whether this is really required. final SAMFileHeader header = reader.getFileHeader(); logger.debug(String.format("Sort order is: " + header.getSortOrder())); @@ -944,6 +971,37 @@ public class SAMDataSource { } } } + + /** + * Locates the index file alongside the given BAM, if present. + * TODO: This is currently a hachetjob that reaches into Picard and pulls out its index file locator. Replace with something more permanent. + * @param bamFile The data file to use. + * @return A File object if the index file is present; null otherwise. + */ + private File findIndexFile(File bamFile) { + File indexFile; + + try { + Class bamFileReaderClass = Class.forName("net.sf.samtools.BAMFileReader"); + Method indexFileLocator = bamFileReaderClass.getDeclaredMethod("findIndexFile",File.class); + indexFileLocator.setAccessible(true); + indexFile = (File)indexFileLocator.invoke(null,bamFile); + } + catch(ClassNotFoundException ex) { + throw new ReviewedStingException("Unable to locate BAMFileReader class, used to check for index files"); + } + catch(NoSuchMethodException ex) { + throw new ReviewedStingException("Unable to locate Picard index file locator."); + } + catch(IllegalAccessException ex) { + throw new ReviewedStingException("Unable to access Picard index file locator."); + } + catch(InvocationTargetException ex) { + throw new ReviewedStingException("Unable to invoke Picard index file locator."); + } + + return indexFile; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/BAMProcessingPerformanceMeter.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/BAMProcessingPerformanceMeter.java new file mode 100644 index 000000000..ef506b1fa --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/BAMProcessingPerformanceMeter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.utils.SimpleTimer; + +import java.io.File; + +/** + * Basic suite for testing idealized and actual performance of read processing. + */ +public class BAMProcessingPerformanceMeter extends CommandLineProgram { + @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true) + File samFile; + + @Input(fullName = "reference_file", shortName="R", doc = "Associated FASTA sequence", required = true) + File referenceFile; + + @Argument(fullName="test_repetitions", shortName = "test_reps", doc="Number of times to repeat each test", required = false) + int testRepetitions = 5; + + @Argument(fullName="print_frequency", shortName = "pf", doc="Print cumulative time after x # reads", required = false) + int printFrequency = 100000; + + private void testBAMFileProcessingThroughput(ReadProcessor readProcessor) { + readProcessor.execute(samFile,referenceFile); + } + + public int execute() { + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new NoAdditionalProcessing(this)); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverEachBase(this)); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new IterateOverCigarString(this)); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new ExtractTag(this,"OQ")); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeSamLocusIterator(this)); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, GATKArgumentCollection.getDefaultDownsamplingMethod())); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(new InvokeLocusIteratorByState(this, DownsamplingMethod.NONE)); + GATKWalkerInvoker countReadsInvoker = new GATKWalkerInvoker(this); + CountReadsPerformanceWalker countReadsWalker = new CountReadsPerformanceWalker(countReadsInvoker); + countReadsInvoker.setWalker(countReadsWalker); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countReadsInvoker); + + GATKWalkerInvoker countBasesInReadInvoker = new GATKWalkerInvoker(this); + CountBasesInReadPerformanceWalker countBasesInReadWalker = new CountBasesInReadPerformanceWalker(countBasesInReadInvoker); + countBasesInReadInvoker.setWalker(countBasesInReadWalker); + for(int i = 0; i < testRepetitions; i++) testBAMFileProcessingThroughput(countBasesInReadInvoker); + + return 0; + } + + /** + * Required main method implementation. + * @param argv Command-line argument text. + * @throws Exception on error. + */ + public static void main(String[] argv) throws Exception { + int returnCode = 0; + try { + BAMProcessingPerformanceMeter instance = new BAMProcessingPerformanceMeter(); + start(instance, argv); + returnCode = 0; + } + catch(Exception ex) { + returnCode = 1; + ex.printStackTrace(); + throw ex; + } + finally { + System.exit(returnCode); + } + } +} + +abstract class ReadProcessor { + private final SimpleTimer timer; + private final int printFrequency; + protected int iterations = 0; + + public ReadProcessor(BAMProcessingPerformanceMeter performanceMeter) { + timer = new SimpleTimer("timer"); + this.printFrequency = performanceMeter.printFrequency; + } + + public abstract String getTestName(); + public String getIterationType() { return "loci"; } + + public void processRead(final SAMRecord read) { } + public void execute(File bamFile,File fastaFile) { + SAMFileReader reader = new SAMFileReader(bamFile); + startTest(); + for(SAMRecord read: reader) { + processRead(read); + updateIterationCount(); + } + stopTest(); + reader.close(); + } + + protected void startTest() { + timer.start(); + } + + protected void stopTest() { + timer.stop(); + printStatus("TEST COMPLETE"); + } + + protected void updateIterationCount() { + if(++iterations % printFrequency == 0) printStatus("ONGOING"); + } + + private void printStatus(String prefix) { + System.out.printf("%s: %s printed %d %s in %f seconds.%n",prefix,getTestName(),iterations,getIterationType(),timer.getElapsedTime()); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountBasesInReadPerformanceWalker.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountBasesInReadPerformanceWalker.java new file mode 100644 index 000000000..d40a33892 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountBasesInReadPerformanceWalker.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:55 AM + * To change this template use File | Settings | File Templates. + */ +class CountBasesInReadPerformanceWalker extends ReadWalker { + private long As; + private long Cs; + private long Gs; + private long Ts; + + private final GATKWalkerInvoker invoker; + + public CountBasesInReadPerformanceWalker(GATKWalkerInvoker walkerInvoker) { + this.invoker = walkerInvoker; + + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + for(byte base: read.getReadBases()) { + switch(base) { + case 'A': As++; break; + case 'C': Cs++; break; + case 'G': Gs++; break; + case 'T': Ts++; break; + } + } + invoker.updateIterationCount(); + return 1; + } + + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return value + accum; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountReadsPerformanceWalker.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountReadsPerformanceWalker.java new file mode 100644 index 000000000..3d0771e4b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/CountReadsPerformanceWalker.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:55 AM + * To change this template use File | Settings | File Templates. + */ +class CountReadsPerformanceWalker extends ReadWalker { + private final GATKWalkerInvoker invoker; + + public CountReadsPerformanceWalker(GATKWalkerInvoker walkerInvoker) { + this.invoker = walkerInvoker; + + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + invoker.updateIterationCount(); + return 1; + } + + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return value + accum; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/ExtractTag.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/ExtractTag.java new file mode 100644 index 000000000..a2a05bb55 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/ExtractTag.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:53 AM + * To change this template use File | Settings | File Templates. + */ +class ExtractTag extends ReadProcessor { + private final String tag; + + public ExtractTag(final BAMProcessingPerformanceMeter performanceMeter, final String tag) { + super(performanceMeter); + this.tag = tag; + } + + @Override + public String getTestName() { return "extract tag"; } + public void processRead(final SAMRecord read) { + read.getAttribute(tag); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/GATKWalkerInvoker.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/GATKWalkerInvoker.java new file mode 100644 index 000000000..14a2d3e1e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/GATKWalkerInvoker.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.picard.filter.SamRecordFilter; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.walkers.Walker; + +import java.io.File; +import java.util.Collections; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:54 AM + * To change this template use File | Settings | File Templates. + */ +class GATKWalkerInvoker extends ReadProcessor { + /** + * Walker to run over the existing dataset. + */ + private Walker walker; + + public GATKWalkerInvoker(BAMProcessingPerformanceMeter performanceMeter) { + super(performanceMeter); + } + + @Override + public String getTestName() { return "GATK-CountReads"; } + + public void setWalker(Walker walker) { + this.walker = walker; + } + + @Override + public void execute(File samFile, File fastaFile) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // Establish the argument collection + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + argCollection.referenceFile = fastaFile; + argCollection.samFiles = Collections.singletonList(samFile.getAbsolutePath()); + + engine.setArguments(argCollection); + // Bugs in the engine mean that this has to be set twice. + engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(samFile,new Tags()))); + engine.setFilters(Collections.emptyList()); + engine.setReferenceMetaDataFiles(Collections.emptyList()); + + // Create the walker + engine.setWalker(walker); + + startTest(); + engine.execute(); + stopTest(); + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeLocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeLocusIteratorByState.java new file mode 100644 index 000000000..db2280a8e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeLocusIteratorByState.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.picard.filter.FilteringIterator; +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.baq.BAQ; + +import java.io.File; +import java.util.Collections; +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:54 AM + * To change this template use File | Settings | File Templates. + */ +class InvokeLocusIteratorByState extends ReadProcessor { + private final DownsamplingMethod downsamplingMethod; + + public InvokeLocusIteratorByState(final BAMProcessingPerformanceMeter performanceMeter,DownsamplingMethod downsamplingMethod) { + super(performanceMeter); + this.downsamplingMethod = downsamplingMethod; + } + + @Override + public String getTestName() { + if(downsamplingMethod != DownsamplingMethod.NONE) + return String.format("invoke locus iterator by state; downsampling by sample to coverage = %d; ",downsamplingMethod.toCoverage); + else + return String.format("invoke locus iterator by state; no downsampling; "); + } + + @Override + public String getIterationType() { return "loci"; } + + @Override + public void execute(File samFile, File fastaFile) { + SAMFileReader reader = new SAMFileReader(samFile); + ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(samFile,new Tags())), + reader.getFileHeader(), + false, + SAMFileReader.ValidationStringency.SILENT, + 0, + downsamplingMethod, + new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), + Collections.emptyList(), + false, + false, + BAQ.CalculationMode.OFF, + BAQ.QualityMode.DONT_MODIFY, + null, + (byte)0); + + GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); + SampleDataSource sampleDataSource = new SampleDataSource(); + sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader()); + + // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? + Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); + LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource); + startTest(); + while(locusIteratorByState.hasNext()) { + locusIteratorByState.next(); + updateIterationCount(); + } + stopTest(); + + reader.close(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeSamLocusIterator.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeSamLocusIterator.java new file mode 100644 index 000000000..f55dfcec1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/InvokeSamLocusIterator.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.picard.util.SamLocusIterator; +import net.sf.samtools.SAMFileReader; + +import java.io.File; +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:54 AM + * To change this template use File | Settings | File Templates. + */ +class InvokeSamLocusIterator extends ReadProcessor { + public InvokeSamLocusIterator(final BAMProcessingPerformanceMeter performanceMeter) { + super(performanceMeter); + } + + @Override + public String getTestName() { + return String.format("invoke sam locus iterator"); + } + + @Override + public String getIterationType() { return "loci"; } + + @Override + public void execute(File samFile, File fastaFile) { + SAMFileReader reader = new SAMFileReader(samFile); + + SamLocusIterator samLocusIterator = new SamLocusIterator(reader); + samLocusIterator.setEmitUncoveredLoci(false); + Iterator workhorseIterator = samLocusIterator.iterator(); + + startTest(); + while(workhorseIterator.hasNext()) { + SamLocusIterator.LocusInfo locusInfo = workhorseIterator.next(); + updateIterationCount(); + } + stopTest(); + + reader.close(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverCigarString.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverCigarString.java new file mode 100644 index 000000000..5836c0974 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverCigarString.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:53 AM + * To change this template use File | Settings | File Templates. + */ +class IterateOverCigarString extends ReadProcessor { + private long matchMismatches; + private long insertions; + private long deletions; + private long others; + + public IterateOverCigarString(final BAMProcessingPerformanceMeter performanceMeter) { + super(performanceMeter); + } + + @Override + public String getTestName() { return "iterator over cigar string"; } + public void processRead(final SAMRecord read) { + Cigar cigar = read.getCigar(); + for(CigarElement cigarElement: cigar.getCigarElements()) { + int elementSize = cigarElement.getLength(); + while(elementSize > 0) { + switch(cigarElement.getOperator()) { + case M: matchMismatches++; break; + case I: insertions++; break; + case D: deletions++; break; + default: others++; break; + } + elementSize--; + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverEachBase.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverEachBase.java new file mode 100644 index 000000000..3c019841d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/IterateOverEachBase.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:53 AM + * To change this template use File | Settings | File Templates. + */ +class IterateOverEachBase extends ReadProcessor { + private long As; + private long Cs; + private long Gs; + private long Ts; + + public IterateOverEachBase(final BAMProcessingPerformanceMeter performanceMeter) { + super(performanceMeter); + } + + @Override + public String getTestName() { return "iterate over each base"; } + public void processRead(final SAMRecord read) { + for(byte base: read.getReadBases()) { + switch(base) { + case 'A': As++; break; + case 'C': Cs++; break; + case 'G': Gs++; break; + case 'T': Ts++; break; + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/NoAdditionalProcessing.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/NoAdditionalProcessing.java new file mode 100644 index 000000000..9e2afd548 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/performance/NoAdditionalProcessing.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.performance; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:53 AM + * To change this template use File | Settings | File Templates. + */ +class NoAdditionalProcessing extends ReadProcessor { + public NoAdditionalProcessing(final BAMProcessingPerformanceMeter performanceMeter) { + super(performanceMeter); + } + + @Override + public String getTestName() { return "no additional processing"; } + public void processRead(final SAMRecord read) {} +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java new file mode 100644 index 000000000..70687ec25 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/UnzipSingleBlock.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads.utilities; + +import net.sf.samtools.util.BlockGunzipper; +import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.broadinstitute.sting.commandline.Input; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +/** + * Test decompression of a single BGZF block. + */ +public class UnzipSingleBlock extends CommandLineProgram { + @Input(fullName = "block_file", shortName = "b", doc = "block file over which to test unzipping", required = true) + private File blockFile; + + @Input(fullName = "compressed_block_size", shortName = "cbs", doc = "size of compressed block", required = true) + private int compressedBufferSize; + + public int execute() throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException { + byte[] compressedBuffer = new byte[(int)blockFile.length()]; + byte[] uncompressedBuffer = new byte[65536]; + + FileInputStream fis = new FileInputStream(blockFile); + fis.read(compressedBuffer); + fis.close(); + + BlockGunzipper gunzipper = new BlockGunzipper(); + gunzipper.setCheckCrcs(true); + Method unzipBlock = BlockGunzipper.class.getDeclaredMethod("unzipBlock",byte[].class,byte[].class,Integer.TYPE); + unzipBlock.setAccessible(true); + + unzipBlock.invoke(gunzipper,uncompressedBuffer,compressedBuffer,compressedBufferSize); + + System.out.printf("SUCCESS!%n"); + + return 0; + } + + /** + * Required main method implementation. + * @param argv Command-line argument text. + * @throws Exception on error. + */ + public static void main(String[] argv) throws Exception { + int returnCode = 0; + try { + UnzipSingleBlock instance = new UnzipSingleBlock(); + start(instance, argv); + returnCode = 0; + } + catch(Exception ex) { + returnCode = 1; + ex.printStackTrace(); + throw ex; + } + finally { + System.exit(returnCode); + } + } +} diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java index 567b1a3de..fa21b9f38 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java @@ -88,7 +88,7 @@ public class GATKBAMIndexUnitTest extends BaseTest { // Level 5 Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681); - Assert.assertEquals(bamIndex.getLevelSize(5),37449-4681+1); + Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1); } }