From b992abb6eb6e7936dbe388e0f50ee9144324d247 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 9 Feb 2011 01:51:34 +0000 Subject: [PATCH] A few more unit tests plus some extra functionality for BAM index visualization. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5222 348d0f76-0448-11de-a6fe-93d51630548a --- build.xml | 1 + java/src/net/sf/samtools/GATKBAMFileSpan.java | 11 ++ java/src/net/sf/samtools/GATKBin.java | 5 + .../reads/BAMBlockStartIterator.java | 133 ++++++++++++++++++ .../gatk/datasources/reads/GATKBAMIndex.java | 29 +++- .../reads/GATKBAMIndexUnitTest.java | 94 +++++++++++++ 6 files changed, 271 insertions(+), 2 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java create mode 100644 java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/build.xml b/build.xml index f5a6dc62e..984b254d3 100644 --- a/build.xml +++ b/build.xml @@ -321,6 +321,7 @@ + diff --git a/java/src/net/sf/samtools/GATKBAMFileSpan.java b/java/src/net/sf/samtools/GATKBAMFileSpan.java index d1bf9d1cd..ba99f73e5 100644 --- a/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -56,4 +56,15 @@ public class GATKBAMFileSpan extends BAMFileSpan { public GATKBAMFileSpan(final List chunks) { super(new ArrayList(chunks)); } + + /** + * Gets the constituent chunks stored in this span. + * @return A newly created, modifiable list holding a GATKChunk built from each chunk in this span. 
+ */ + public List getGATKChunks() { + List gatkChunks = new ArrayList(); + for(Chunk chunk: getChunks()) + gatkChunks.add(new GATKChunk(chunk)); + return gatkChunks; + } } diff --git a/java/src/net/sf/samtools/GATKBin.java b/java/src/net/sf/samtools/GATKBin.java index 95f4000ec..0922ef630 100644 --- a/java/src/net/sf/samtools/GATKBin.java +++ b/java/src/net/sf/samtools/GATKBin.java @@ -62,4 +62,9 @@ public class GATKBin extends Bin { public void setGATKChunkList(List chunks) { super.setChunkList(new ArrayList(chunks)); } + + @Override + public String toString() { + return String.format("Bin %d in contig %d",getBinNumber(),getReferenceSequence()); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java new file mode 100644 index 000000000..a9e04e357 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMBlockStartIterator.java @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Iterates over the file offsets at which successive BGZF blocks of a BAM file begin. + * User: mhanna + * Date: Feb 7, 2011 + * Time: 2:46:34 PM + * Each offset is found by reading a block's header and skipping ahead by its BLOCKSIZE. + */ +public class BAMBlockStartIterator implements Iterator { + /** + * How large is a BGZF header? + */ + private static int BGZF_HEADER_SIZE = 18; + + /** + * Where within the header does the BLOCKSIZE actually live? 
+ */ + private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2; + + private FileChannel bamInputChannel; + private ByteBuffer headerByteBuffer; + + private long nextLocation = 0; + + public BAMBlockStartIterator(File bamFile) { + try { + FileInputStream bamInputStream = new FileInputStream(bamFile); + bamInputChannel = bamInputStream.getChannel(); + + headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE); + headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN); + + } + catch(IOException ex) { + throw new StingException("Could not open file",ex); + } + } + + public boolean hasNext() { + return nextLocation != -1; + } + + public Long next() { + long currentLocation = nextLocation; + advance(); + return currentLocation; + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator"); + } + + private void advance() { + int readStatus; + + headerByteBuffer.clear(); + try { + readStatus = bamInputChannel.read(headerByteBuffer); + } + catch(IOException ex) { + throw new StingException("Could not read header data",ex); + } + + if(readStatus == -1) { + nextLocation = -1; + try { + bamInputChannel.close(); + } + catch(IOException ex) { + throw new StingException("Could not close input file",ex); + } + return; + } + + headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION); + int blockSize = headerByteBuffer.getShort(); + + try { + bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1); + nextLocation = bamInputChannel.position(); + } + catch(IOException ex) { + throw new StingException("Could not reposition input stream",ex); + } + } + + public static void main(String argv[]) throws IOException { + BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam")); + int i = 0; + while(blockStartIterator.hasNext()) + System.out.printf("%d -> %d%n",i++,blockStartIterator.next()); + } +} diff --git 
a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index bdc544d47..782fa3f88 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -126,8 +126,8 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { * @return The size (number of possible bins) of the given level. */ public int getLevelSize(final int levelNumber) { - if(levelNumber == getNumIndexLevels()) - return MAX_BINS+1-LEVEL_STARTS[levelNumber]; + if(levelNumber == getNumIndexLevels()-1) + return MAX_BINS-LEVEL_STARTS[levelNumber]; else return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; } @@ -331,6 +331,31 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { return new GATKBAMFileSpan(chunkList); } + public GATKBAMFileSpan getContentsOfBin(final Bin bin) { + if(bin == null) + return null; + + GATKBin gatkBin = new GATKBin(bin); + + BAMIndexContent indexQuery = getQueryResults(gatkBin.getReferenceSequence()); + + if(indexQuery == null) + return null; + + GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber()); + + return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getGATKChunkList()) : null; + } + + /** + * Retrieves the linear index for the given reference sequence. + * @param referenceSequence Reference sequence number for which to retrieve the linear index. + * @return The linear index for the given reference sequence. + */ + public LinearIndex getLinearIndex(int referenceSequence) { + return getQueryResults(referenceSequence).getLinearIndex(); + } + /** * Get a list of bins in the BAM file that may contain SAMRecords for the given range. 
* @param referenceIndex sequence of desired SAMRecords diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java new file mode 100644 index 000000000..567b1a3de --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +/** + * Test basic functionality in the GATK's implementation of the BAM index classes. + */ +public class GATKBAMIndexUnitTest extends BaseTest { + private static File bamFile = new File(validationDataLocation+"MV1994.selected.bam"); + + /** + * Index file forming the source of all unit tests. + */ + private static File bamIndexFile = new File(validationDataLocation+"MV1994.selected.bam.bai"); + + /** + * Storage for the index itself. + */ + private GATKBAMIndex bamIndex; + + + @BeforeClass + public void init() throws FileNotFoundException { + SAMFileReader reader = new SAMFileReader(bamFile); + SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); + reader.close(); + + bamIndex = new GATKBAMIndex(bamIndexFile,sequenceDictionary); + } + + @Test + public void testNumberAndSizeOfIndexLevels() { + // The correct values for this test are pulled directly from the + // SAM Format Specification v1.3-r882, Section 4.1.1, last paragraph. 
+ Assert.assertEquals(GATKBAMIndex.getNumIndexLevels(),6,"Incorrect number of levels in BAM index"); + + // Level 0 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(0),0); + Assert.assertEquals(bamIndex.getLevelSize(0),1); + + // Level 1 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(1),1); + Assert.assertEquals(bamIndex.getLevelSize(1),8-1+1); + + // Level 2 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(2),9); + Assert.assertEquals(bamIndex.getLevelSize(2),72-9+1); + + // Level 3 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(3),73); + Assert.assertEquals(bamIndex.getLevelSize(3),584-73+1); + + // Level 4 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(4),585); + Assert.assertEquals(bamIndex.getLevelSize(4),4680-585+1); + + // Level 5 + Assert.assertEquals(GATKBAMIndex.getFirstBinInLevel(5),4681); + Assert.assertEquals(bamIndex.getLevelSize(5),37449-4681+1); + } + +}