From 24e24b946873464becc9244aa50ef2220e4ae41e Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 12 Aug 2015 01:23:54 -0300 Subject: [PATCH] Using `SamIndexes.asBaiSeekableStreamOrNull()` to support `.cram.crai`. Updated other IntelliJ IDEA warnings in GATKBAMIndex. Updated example .cram files to match versions generated by current GATK/HTSJDK. Bumped HTSJDK and Picard to 1.139 releases. Added support for using `-SNAPSHOT` of HTSJDK in the future. --- pom.xml | 1 + .../datasources/reads/GATKBAMIndex.java | 25 +++++++++++------- .../datasources/reads/SAMDataSource.java | 12 ++++++++- .../engine/arguments/CramIntegrationTest.java | 18 +++++++------ .../reads/GATKBAMIndexUnitTest.java | 15 +++++++---- public/gatk-root/pom.xml | 15 +++++++++-- .../resources/exampleCRAM-nobai-nocrai.cram | Bin 5281 -> 5281 bytes .../resources/exampleCRAM-nobai-withcrai.cram | Bin 5281 -> 5281 bytes .../src/test/resources/exampleCRAM.cram | Bin 5281 -> 5281 bytes 9 files changed, 60 insertions(+), 26 deletions(-) diff --git a/pom.xml b/pom.xml index da256634c..6490280d1 100644 --- a/pom.xml +++ b/pom.xml @@ -161,6 +161,7 @@ ${gatk.executable.directory}/lib runtime + false diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java index eba6c017f..b1d54d2b1 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java @@ -25,12 +25,10 @@ package org.broadinstitute.gatk.engine.datasources.reads; -import htsjdk.samtools.Bin; -import htsjdk.samtools.GATKBin; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.LinearIndex; +import htsjdk.samtools.*; import htsjdk.samtools.seekablestream.SeekableBufferedStream; import htsjdk.samtools.seekablestream.SeekableFileStream; +import htsjdk.samtools.seekablestream.SeekableStream; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -70,10 +68,11 @@ public class GATKBAMIndex { */ public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 + private final SAMSequenceDictionary sequenceDictionary; private final File mFile; //TODO: figure out a good value for this buffer size - private final int BUFFERED_STREAM_BUFFER_SIZE = 8192; + private static final int BUFFERED_STREAM_BUFFER_SIZE = 8192; /** * Number of sequences stored in this index. @@ -86,11 +85,14 @@ public class GATKBAMIndex { private final long[] sequenceStartCache; private SeekableFileStream fileStream; + private SeekableStream baiStream; private SeekableBufferedStream bufferedStream; private long fileLength; - public GATKBAMIndex(final File file) { + public GATKBAMIndex(final File file, final SAMSequenceDictionary sequenceDictionary) { mFile = file; + this.sequenceDictionary = sequenceDictionary; + // Open the file stream. openIndexFile(); @@ -127,12 +129,12 @@ public class GATKBAMIndex { skipToSequence(referenceSequence); int binCount = readInteger(); - List bins = new ArrayList(); + List bins = new ArrayList<>(); for (int binNumber = 0; binNumber < binCount; binNumber++) { final int indexBin = readInteger(); final int nChunks = readInteger(); - List chunks = new ArrayList(nChunks); + List chunks = new ArrayList<>(nChunks); long[] rawChunkData = readLongs(nChunks*2); for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = rawChunkData[ci*2]; @@ -289,7 +291,8 @@ public class GATKBAMIndex { final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j = 0; j < nBins; j++) { - final int bin = readInteger(); + /* final int bin = */ + readInteger(); final int nChunks = readInteger(); // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); skipBytes(16 * nChunks); @@ -308,7 +311,8 @@ public class GATKBAMIndex { private void openIndexFile() { try { fileStream = new SeekableFileStream(mFile); - bufferedStream = new SeekableBufferedStream(fileStream,BUFFERED_STREAM_BUFFER_SIZE); + baiStream = SamIndexes.asBaiSeekableStreamOrNull(fileStream, sequenceDictionary); + bufferedStream = new SeekableBufferedStream(baiStream, BUFFERED_STREAM_BUFFER_SIZE); fileLength=bufferedStream.length(); } catch (IOException exc) { @@ -319,6 +323,7 @@ public class GATKBAMIndex { private void closeIndexFile() { try { bufferedStream.close(); + baiStream.close(); fileStream.close(); fileLength = -1; } diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java index b735ff833..c97201b09 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java @@ -28,6 +28,7 @@ package org.broadinstitute.gatk.engine.datasources.reads; import htsjdk.samtools.MergingSamRecordIterator; import htsjdk.samtools.SamFileHeaderMerger; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.RuntimeIOException; @@ -372,10 +373,19 @@ public class SAMDataSource { originalToMergedReadGroupMappings.put(id,mappingToMerged); } + final SAMSequenceDictionary samSequenceDictionary; + if (referenceFile == null) { + samSequenceDictionary = mergedHeader.getSequenceDictionary(); + } else { + samSequenceDictionary = ReferenceSequenceFileFactory. + getReferenceSequenceFile(referenceFile). + getSequenceDictionary(); + } + for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.getSamFile()); if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); + bamIndices.put(id,new GATKBAMIndex(indexFile, samSequenceDictionary)); } resourcePool.releaseReaders(readers); diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java index 2748b1a81..f1e832d2c 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java @@ -50,11 +50,14 @@ public class CramIntegrationTest extends WalkerTest { {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, + {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, + {"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, + {"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, }; } @Test(dataProvider = "cramData") - public void testCRAM(String walker, String input, String args, String ext, String md5) { + public void testCram(String walker, String input, String args, String ext, String md5) { WalkerTestSpec spec = new WalkerTestSpec( " -T Test" + walker + "Walker" + " -I " + publicTestDir + input + @@ -64,25 +67,24 @@ public class CramIntegrationTest extends WalkerTest { 1, // just one output file Collections.singletonList(ext), Collections.singletonList(md5)); - executeTest(String.format("testCRAM %s %s -> %s: %s", walker, input, ext, args), spec); + executeTest(String.format("testCram %s %s -> %s: %s", walker, input, ext, args), spec); } - @DataProvider(name = "cramNoBaiData") - public Object[][] getCRAMNoBaiData() { + @DataProvider(name = "cramNoIndexData") + public Object[][] getCramNoIndexData() { return new Object[][]{ {"exampleCRAM-nobai-nocrai.cram"}, - {"exampleCRAM-nobai-withcrai.cram"}, }; } - @Test(dataProvider = "cramNoBaiData") - public void testCRAMNoBai(String input) { + @Test(dataProvider = "cramNoIndexData") + public void testCramNoIndex(String input) { WalkerTestSpec spec = new WalkerTestSpec( " -T TestPrintReadsWalker" + " -I " + publicTestDir + input + " -R " + exampleFASTA, 0, UserException.class); - executeTest(String.format("testCRAMNoBai %s", input), spec); + executeTest(String.format("testCramNoIndex %s", input), spec); } } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java index 8e7f86de1..13f356959 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java @@ -52,14 +52,19 @@ public class GATKBAMIndexUnitTest extends BaseTest { */ private GATKBAMIndex bamIndex; - + /** + * Sequences. + */ + private SAMSequenceDictionary sequenceDictionary; + + @BeforeClass public void init() throws FileNotFoundException { SAMFileReader reader = new SAMFileReader(bamFile); - SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); + this.sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); - bamIndex = new GATKBAMIndex(bamIndexFile); + bamIndex = new GATKBAMIndex(bamIndexFile, sequenceDictionary); } @Test @@ -95,13 +100,13 @@ public class GATKBAMIndexUnitTest extends BaseTest { @Test( expectedExceptions = UserException.MalformedFile.class ) public void testDetectTruncatedBamIndexWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_word_boundary.bai")); + GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } @Test( expectedExceptions = UserException.MalformedFile.class ) public void testDetectTruncatedBamIndexNonWordBoundary() { - GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_non_word_boundary.bai")); + GATKBAMIndex index = new GATKBAMIndex(new File(privateTestDir + "truncated_at_non_word_boundary.bai"), sequenceDictionary); index.readReferenceSequence(0); } diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index a67020fdd..11fcd7c98 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -44,8 +44,8 @@ org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.gatk.utils.TestNGTestTransformer,org.broadinstitute.gatk.utils.GATKTextReporter,org.uncommons.reportng.HTMLReporter - 1.138 - 1.138 + 1.139 + 1.139 @@ -718,6 +718,17 @@ GATK Public Local Repository file:${gatk.basedir}/public/repo + + + false + + + true + + broad.artifactory.snapshots + Broad Institute Artifactory SNAPSHOTs + https://artifactory.broadinstitute.org/artifactory/libs-snapshot + diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram index b688b9c2864d2b6905c628efbaa26e6e1825ff91..33409003103c41a4962cab5d3562ed850a69b6bd 100644 GIT binary patch delta 12 TcmZ3exlnV06{G1!>ogGn9FPP1 delta 12 TcmZ3exlnV06{E>U>ogGn9E$_` diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram index b688b9c2864d2b6905c628efbaa26e6e1825ff91..33409003103c41a4962cab5d3562ed850a69b6bd 100644 GIT binary patch delta 12 TcmZ3exlnV06{G1!>ogGn9FPP1 delta 12 TcmZ3exlnV06{E>U>ogGn9E$_` diff --git a/public/gatk-utils/src/test/resources/exampleCRAM.cram b/public/gatk-utils/src/test/resources/exampleCRAM.cram index b688b9c2864d2b6905c628efbaa26e6e1825ff91..33409003103c41a4962cab5d3562ed850a69b6bd 100644 GIT binary patch delta 12 TcmZ3exlnV06{G1!>ogGn9FPP1 delta 12 TcmZ3exlnV06{E>U>ogGn9E$_`