diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index a7487ba55..bc6089d1c 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -91,7 +91,7 @@ public class BAMSchedule implements CloseableIterator { * @param indexFiles Index files. * @param intervals List of */ - public BAMSchedule(final Map indexFiles, final List intervals) { + public BAMSchedule(final Map indexFiles, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); @@ -102,7 +102,8 @@ public class BAMSchedule implements CloseableIterator { readerIDs.addAll(indexFiles.keySet()); for(final SAMReaderID reader: readerIDs) { - GATKBAMIndex index = new GATKBAMIndex(indexFiles.get(reader),referenceSequence); + final GATKBAMIndex index = indexFiles.get(reader); + final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); Iterator locusIterator = intervals.iterator(); @@ -132,7 +133,7 @@ public class BAMSchedule implements CloseableIterator { // Code at this point knows that the current bin is neither before nor after the current locus, // so it must overlap. Add this region to the filesystem. - final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(bin); + final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin); if(!fileSpan.isEmpty()) { // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index b3cdbc31d..45aee6758 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -47,7 +47,7 @@ import java.util.NoSuchElementException; public class BAMScheduler implements Iterator { private final SAMDataSource dataSource; - private final Map indexFiles = new HashMap(); + private final Map indexFiles = new HashMap(); private FilePointer nextFilePointer = null; @@ -60,7 +60,7 @@ public class BAMScheduler implements Iterator { public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { this.dataSource = dataSource; for(SAMReaderID reader: dataSource.getReaderIDs()) - indexFiles.put(reader,(File)dataSource.getIndex(reader)); + indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); this.loci = loci; locusIterator = new PeekableIterator(loci.iterator()); if(locusIterator.hasNext()) @@ -184,11 +184,11 @@ public class BAMScheduler implements Iterator { /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indexFiles BAM index file. + * @param indices BAM indices. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indexFiles, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { // Stale reference sequence or first invocation. (Re)create the binTreeIterator. if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { if(bamScheduleIterator != null) @@ -202,7 +202,7 @@ public class BAMScheduler implements Iterator { lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indexFiles,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); } if(!bamScheduleIterator.hasNext()) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMFileConstants.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMFileConstants.java deleted file mode 100644 index 47b56db94..000000000 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMFileConstants.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -/** - * Constants used in reading & writing BAM files - */ -class GATKBAMFileConstants { - /** - * The beginning of a BAMRecord is a fixed-size block of 8 int32s - */ - static final int FIXED_BLOCK_SIZE = 8 * 4; - - /** - * Sanity check -- we never expect BAMRecords to be as big as this. - */ - static final int MAXIMUM_RECORD_LENGTH = 1024 * 1024; - - /** - * BAM file magic number. This is what is present in the gunzipped version of the file, - * which never exists on disk. - */ - - static final byte[] BAM_MAGIC = "BAM\1".getBytes(); - /** - * BAM index file magic number. - */ - static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); -} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 46d6d97de..8ebb8b1a8 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -44,11 +44,17 @@ import java.util.*; /** * A basic interface for querying BAM indices. + * Very much not thread-safe. * * @author mhanna * @version 0.1 */ public class GATKBAMIndex { + /** + * BAM index file magic number. + */ + private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + /** * Reports the total amount of genomic data that any bin can index. */ @@ -66,48 +72,64 @@ public class GATKBAMIndex { private final File mFile; - private final List bins = new ArrayList(); - private final LinearIndex linearIndex; + /** + * Number of sequences stored in this index. + */ + private final int sequenceCount; - public GATKBAMIndex(final File file, final int referenceSequence) { + /** + * A cache of the starting positions of the sequences. + */ + private final long[] sequenceStartCache; + + private FileInputStream fileStream; + private FileChannel fileChannel; + + public GATKBAMIndex(final File file) { mFile = file; // Open the file stream. - - FileInputStream fileStream = null; - FileChannel fileChannel = null; - - try { - fileStream = new FileInputStream(mFile); - fileChannel = fileStream.getChannel(); - } - catch (IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } + openIndexFile(); // Verify the magic number. - seek(fileChannel,0); - final byte[] buffer = readBytes(fileChannel,4); - if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) { + seek(0); + final byte[] buffer = readBytes(4); + if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { throw new RuntimeException("Invalid file header in BAM index " + mFile + ": " + new String(buffer)); } - seek(fileChannel,4); + seek(4); - final int sequenceCount = readInteger(fileChannel); + sequenceCount = readInteger(); + + // Create a cache of the starting position of each sequence. Initialize it to -1. + sequenceStartCache = new long[sequenceCount]; + for(int i = 1; i < sequenceCount; i++) + sequenceStartCache[i] = -1; + + // Seed the first element in the array with the current position. + if(sequenceCount > 0) + sequenceStartCache[0] = position(); + + closeIndexFile(); + } + + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + openIndexFile(); if (referenceSequence >= sequenceCount) throw new ReviewedStingException("Invalid sequence number " + referenceSequence); - skipToSequence(fileChannel,referenceSequence); + skipToSequence(referenceSequence); - int binCount = readInteger(fileChannel); + int binCount = readInteger(); + List bins = new ArrayList(); for (int binNumber = 0; binNumber < binCount; binNumber++) { - final int indexBin = readInteger(fileChannel); - final int nChunks = readInteger(fileChannel); + final int indexBin = readInteger(); + final int nChunks = readInteger(); List chunks = new ArrayList(nChunks); - long[] rawChunkData = readLongs(fileChannel,nChunks*2); + long[] rawChunkData = readLongs(nChunks*2); for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = rawChunkData[ci*2]; final long chunkEnd = rawChunkData[ci*2+1]; @@ -120,18 +142,14 @@ public class GATKBAMIndex { bins.set(indexBin,bin); } - final int nLinearBins = readInteger(fileChannel); - long[] linearIndexEntries = readLongs(fileChannel,nLinearBins); + final int nLinearBins = readInteger(); + long[] linearIndexEntries = readLongs(nLinearBins); - linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); + LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); - try { - fileChannel.close(); - fileStream.close(); - } - catch (IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } + closeIndexFile(); + + return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); } /** @@ -203,72 +221,6 @@ public class GATKBAMIndex { return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); } - /** - * Perform an overlapping query of all bins bounding the given location. - * @param bin The bin over which to perform an overlapping query. - * @return The file pointers - */ - public GATKBAMFileSpan getSpanOverlapping(final Bin bin) { - if(bin == null) - return null; - - GATKBin gatkBin = new GATKBin(bin); - - final int binLevel = getLevelForBin(bin); - final int firstLocusInBin = getFirstLocusInBin(bin); - - // Add the specified bin to the tree if it exists. - List binTree = new ArrayList(); - if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null) - binTree.add(bins.get(gatkBin.getBinNumber())); - - int currentBinLevel = binLevel; - while(--currentBinLevel >= 0) { - final int binStart = getFirstBinInLevel(currentBinLevel); - final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel); - final int binNumber = firstLocusInBin/binWidth + binStart; - if(binNumber < bins.size() && bins.get(binNumber) != null) - binTree.add(bins.get(binNumber)); - } - - List chunkList = new ArrayList(); - for(GATKBin coveringBin: binTree) { - for(GATKChunk chunk: coveringBin.getChunkList()) - chunkList.add(chunk.clone()); - } - - final int start = getFirstLocusInBin(bin); - chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start)); - return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); - } - - protected List optimizeChunkList(final List chunks, final long minimumOffset) { - GATKChunk lastChunk = null; - Collections.sort(chunks); - final List result = new ArrayList(); - for (final GATKChunk chunk : chunks) { - if (chunk.getChunkEnd() <= minimumOffset) { - continue; // linear index optimization - } - if (result.isEmpty()) { - result.add(chunk); - lastChunk = chunk; - continue; - } - // Coalesce chunks that are in adjacent file blocks. - // This is a performance optimization. - if (!lastChunk.overlaps(chunk) && !lastChunk.isAdjacentTo(chunk)) { - result.add(chunk); - lastChunk = chunk; - } else { - if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) { - lastChunk.setChunkEnd(chunk.getChunkEnd()); - } - } - } - return result; - } - /** * Gets the possible number of bins for a given reference sequence. * @return How many bins could possibly be used according to this indexing scheme to index a single contig. @@ -277,51 +229,84 @@ public class GATKBAMIndex { return BIN_GENOMIC_SPAN; } - protected void skipToSequence(final FileChannel fileChannel, final int sequenceIndex) { - for (int i = 0; i < sequenceIndex; i++) { + protected void skipToSequence(final int referenceSequence) { + // Find the offset in the file of the last sequence whose position has been determined. Start here + // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always + // be present, so no extra stopping condition is necessary. + int sequenceIndex = referenceSequence; + while(sequenceStartCache[sequenceIndex] == -1) + sequenceIndex--; + + // Advance to the most recently found position. + seek(sequenceStartCache[sequenceIndex]); + + for (int i = sequenceIndex; i < referenceSequence; i++) { + sequenceStartCache[i] = position(); + // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(fileChannel); + final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j = 0; j < nBins; j++) { - final int bin = readInteger(fileChannel); - final int nChunks = readInteger(fileChannel); + final int bin = readInteger(); + final int nChunks = readInteger(); // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(fileChannel, 16 * nChunks); + skipBytes(16 * nChunks); } - final int nLinearBins = readInteger(fileChannel); + final int nLinearBins = readInteger(); // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(fileChannel, 8 * nLinearBins); + skipBytes(8 * nLinearBins); + } + + sequenceStartCache[referenceSequence] = position(); + } + + private void openIndexFile() { + try { + fileStream = new FileInputStream(mFile); + fileChannel = fileStream.getChannel(); + } + catch (IOException exc) { + throw new ReviewedStingException("Unable to open index file " + mFile, exc); + } + } + + private void closeIndexFile() { + try { + fileChannel.close(); + fileStream.close(); + } + catch (IOException exc) { + throw new ReviewedStingException("Unable to close index file " + mFile, exc); } } private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - private byte[] readBytes(final FileChannel fileChannel, int count) { + private byte[] readBytes(int count) { ByteBuffer buffer = getBuffer(count); - read(fileChannel,buffer); + read(buffer); buffer.flip(); byte[] contents = new byte[count]; buffer.get(contents); return contents; } - private int readInteger(final FileChannel fileChannel) { + private int readInteger() { ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); - read(fileChannel,buffer); + read(buffer); buffer.flip(); return buffer.getInt(); } /** * Reads an array of longs from the file channel, returning the results as an array. - * @param fileChannel The file backing the schedule. * @param count Number of longs to read. * @return An array of longs. Size of array should match count. */ - private long[] readLongs(final FileChannel fileChannel, final int count) { + private long[] readLongs(final int count) { ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); - read(fileChannel,buffer); + read(buffer); buffer.flip(); long[] result = new long[count]; for(int i = 0; i < count; i++) @@ -329,7 +314,7 @@ public class GATKBAMIndex { return result; } - private void read(final FileChannel fileChannel, final ByteBuffer buffer) { + private void read(final ByteBuffer buffer) { try { fileChannel.read(buffer); } @@ -356,7 +341,7 @@ public class GATKBAMIndex { return buffer; } - private void skipBytes(final FileChannel fileChannel, final int count) { + private void skipBytes(final int count) { try { fileChannel.position(fileChannel.position() + count); } @@ -365,7 +350,7 @@ public class GATKBAMIndex { } } - private void seek(final FileChannel fileChannel, final int position) { + private void seek(final long position) { try { fileChannel.position(position); } @@ -373,4 +358,17 @@ public class GATKBAMIndex { throw new ReviewedStingException("Index: unable to reposition of file channel."); } } + + /** + * Retrieve the position from the current file channel. + * @return position of the current file channel. + */ + private long position() { + try { + return fileChannel.position(); + } + catch (IOException exc) { + throw new ReviewedStingException("Unable to read position from index file " + mFile, exc); + } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java new file mode 100644 index 000000000..f9b998a60 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexData.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.Bin; +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKBin; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.LinearIndex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Stores and processes a single reference worth of GATK data. + */ +public class GATKBAMIndexData { + private final GATKBAMIndex index; + private final int referenceSequence; + private final List bins; + private final LinearIndex linearIndex; + + public GATKBAMIndexData(final GATKBAMIndex index, final int referenceSequence, final List bins, final LinearIndex linearIndex) { + this.index = index; + this.referenceSequence = referenceSequence; + this.bins = bins; + this.linearIndex = linearIndex; + } + + public int getReferenceSequence() { + return referenceSequence; + } + + /** + * Perform an overlapping query of all bins bounding the given location. + * @param bin The bin over which to perform an overlapping query. + * @return The file pointers + */ + public GATKBAMFileSpan getSpanOverlapping(final Bin bin) { + if(bin == null) + return null; + + GATKBin gatkBin = new GATKBin(bin); + + final int binLevel = index.getLevelForBin(bin); + final int firstLocusInBin = index.getFirstLocusInBin(bin); + + // Add the specified bin to the tree if it exists. + List binTree = new ArrayList(); + if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null) + binTree.add(bins.get(gatkBin.getBinNumber())); + + int currentBinLevel = binLevel; + while(--currentBinLevel >= 0) { + final int binStart = index.getFirstBinInLevel(currentBinLevel); + final int binWidth = index.getMaxAddressibleGenomicLocation()/index.getLevelSize(currentBinLevel); + final int binNumber = firstLocusInBin/binWidth + binStart; + if(binNumber < bins.size() && bins.get(binNumber) != null) + binTree.add(bins.get(binNumber)); + } + + List chunkList = new ArrayList(); + for(GATKBin coveringBin: binTree) { + for(GATKChunk chunk: coveringBin.getChunkList()) + chunkList.add(chunk.clone()); + } + + final int start = index.getFirstLocusInBin(bin); + chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start)); + return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); + } + + private List optimizeChunkList(final List chunks, final long minimumOffset) { + GATKChunk lastChunk = null; + Collections.sort(chunks); + final List result = new ArrayList(); + for (final GATKChunk chunk : chunks) { + if (chunk.getChunkEnd() <= minimumOffset) { + continue; // linear index optimization + } + if (result.isEmpty()) { + result.add(chunk); + lastChunk = chunk; + continue; + } + // Coalesce chunks that are in adjacent file blocks. + // This is a performance optimization. + if (!lastChunk.overlaps(chunk) && !lastChunk.isAdjacentTo(chunk)) { + result.add(chunk); + lastChunk = chunk; + } else { + if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) { + lastChunk.setChunkEnd(chunk.getChunkEnd()); + } + } + } + return result; + } + + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index b27b6e61b..c2aa5f18e 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -85,7 +85,7 @@ public class SAMDataSource { /** * Store BAM indices for each reader present. */ - private final Map bamIndices = new HashMap(); + private final Map bamIndices = new HashMap(); /** * How far along is each reader? @@ -292,9 +292,8 @@ public class SAMDataSource { if(enableLowMemorySharding) { for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.samFile); - if(indexFile != null) { - bamIndices.put(id,indexFile); - } + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); } } @@ -424,6 +423,9 @@ public class SAMDataSource { /** * Gets the index for a particular reader. Always preloaded. + * TODO: Should return object of type GATKBAMIndex, but cannot because there + * TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new + * TODO: sharding system goes live. * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index d3750ef28..24d8bc6c5 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -42,6 +43,7 @@ import org.broadinstitute.sting.utils.text.ListFileUtils; import java.io.File; import java.io.IOException; +import java.io.PrintStream; import java.math.BigInteger; import java.util.ArrayList; import java.util.List; @@ -61,6 +63,9 @@ public class FindLargeShards extends CommandLineProgram { @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.",required=false) public List intervals = null; + @Output(required=false) + public PrintStream out = System.out; + /** * The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be * up to (2^64)^2. @@ -99,7 +104,7 @@ public class FindLargeShards extends CommandLineProgram { intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength())); } - logger.info(String.format("Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); + logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); while(sharder.hasNext()) { @@ -115,7 +120,7 @@ public class FindLargeShards extends CommandLineProgram { if(numberOfShards % 1000 == 0) { GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); - logger.info(String.format("Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); } } @@ -127,7 +132,9 @@ public class FindLargeShards extends CommandLineProgram { // Crank through the shards again, this time reporting on the shards significantly larger than the mean. long threshold = mean + stddev*5; - logger.warn(String.format("Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); + logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); + out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); + sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet); while(sharder.hasNext()) { FilePointer filePointer = sharder.next(); @@ -142,11 +149,11 @@ public class FindLargeShards extends CommandLineProgram { if(filePointer.size() <= threshold) { if(numberOfShards % 1000 == 0) - logger.info(String.format("%s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); continue; } - logger.warn(String.format("FOUND LARGE SHARD: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size); } return 0; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java index 55d6a908e..fde0ce62f 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java @@ -57,7 +57,7 @@ public class GATKBAMIndexUnitTest extends BaseTest { SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); - bamIndex = new GATKBAMIndex(bamIndexFile,0); + bamIndex = new GATKBAMIndex(bamIndexFile); } @Test