diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java new file mode 100644 index 000000000..651d759e0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.NGSPlatform; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Easy to use creator of artificial BAM files for testing + * + * Allows us to make a stream of reads or an index BAM file with read having the following properties + * + * - coming from n samples + * - of fixed read length and aligned to the genome with M operator + * - having N reads per alignment start + * - skipping N bases between each alignment start + * - starting at a given alignment start + * + * User: depristo + * Date: 1/15/13 + * Time: 9:22 AM + */ +public class ArtificialBAMBuilder { + public final static int BAM_SHARD_SIZE = 16384; + + final int nReadsPerLocus; + final int nLoci; + + int skipNLoci = 0; + int alignmentStart = 1; + int readLength = 10; + private final ArrayList samples = new ArrayList(); + + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + { + factory.setCreateIndex(true); + } + + SAMFileHeader header; + + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + createAndSetHeader(1); + } + + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { + this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + samples.clear(); + + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + return this; + } + + public List getSamples() { + return samples; + } + + /** + * Create a read stream based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like LocusIteratorBystate + * + * @return a ordered list of reads + */ + public List makeReads() { + final String baseName = "read"; + List reads = new ArrayList(nReadsPerLocus*nLoci); + for ( int locusI = 0; locusI < nLoci; locusI++) { + final int locus = locusI * (skipNLoci + 1); + for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { + for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { + final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + reads.add(read); + } + } + } + + return reads; + } + + /** + * Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit() + * @return the BAM file + */ + public File makeTemporarilyBAMFile() { + try { + final File file = File.createTempFile("tempBAM", ".bam"); + file.deleteOnExit(); + return makeBAMFile(file); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + /** + * Write the reads from this builder to output, creating an index as well + * @param output the output BAM file we want to use + * @return + */ + public File makeBAMFile(final File output) { + final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0); + for ( final GATKSAMRecord read : makeReads() ) + writer.addAlignment(read); + writer.close(); + return output; + } + + public int getnReadsPerLocus() { return nReadsPerLocus; } + public int getnLoci() { return nLoci; } + public int getSkipNLoci() { return skipNLoci; } + public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; } + public int getAlignmentStart() { return alignmentStart; } + public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; } + public int getReadLength() { return readLength; } + public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; } + public SAMFileHeader getHeader() { return header; } + public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + + public int getNSamples() { return samples.size(); } + + public int expectedNumberOfReads() { + return nLoci * nReadsPerLocus * header.getReadGroups().size(); + } + + @Override + public String toString() { + return "ArtificialBAMBuilder{" + + "samples=" + samples + + ", readLength=" + readLength + + ", alignmentStart=" + alignmentStart + + ", skipNLoci=" + skipNLoci + + ", nLoci=" + nLoci + + ", nReadsPerLocus=" + nReadsPerLocus + + '}'; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 4af6555d9..0f5d6a2f7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -327,35 +327,6 @@ public class ArtificialSAMUtils { return stack; } - /** - * Create a read stream based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. - * - * Useful for testing things like LocusIteratorBystate - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static List createReadStream( final int nReadsPerLocus, - final int nLoci, - final SAMFileHeader header, - final int alignmentStart, - final int length ) { - final String baseName = "read"; - List reads = new ArrayList(nReadsPerLocus*nLoci); - for ( int locus = 0; locus < nLoci; locus++ ) { - for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { - for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { - final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); - final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - reads.add(read); - } - } - } - - return reads; - } - /** * create an iterator containing the specified read piles * diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 37494903c..2f984165e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -28,17 +28,16 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -447,26 +446,19 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); - final String sample = "sample" + i; - samples.add(sample); - rg.setSample(sample); - rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); - header.addReadGroup(rg); - } - final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); + + final List reads = bamBuilder.makeReads(); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), createTestReadProperties(downsampler, keepReads), genomeLocParser, - samples); + bamBuilder.getSamples()); final Set seenSoFar = new HashSet(); final Set keptReads = new HashSet(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java new file mode 100644 index 000000000..cf3c97b34 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.apache.commons.collections.IteratorUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/15/13 + * Time: 3:49 PM + * To change this template use File | Settings | File Templates. + */ +public class ArtificialBAMBuilderUnitTest extends BaseTest { + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + for ( final int readLength : Arrays.asList(10, 20) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nSamples : Arrays.asList(1, 2) ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int nLoci : Arrays.asList(10, 100, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + bamBuilder.createAndSetHeader(nSamples); + tests.add(new Object[]{bamBuilder, readLength, skips, start, nSamples, nReadsPerLocus, nLoci}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombinatorialARTTilingProvider") + public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { + Assert.assertEquals(bamBuilder.getReadLength(), readLength); + Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); + Assert.assertEquals(bamBuilder.getAlignmentStart(), start); + Assert.assertEquals(bamBuilder.getNSamples(), nSamples); + Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus); + Assert.assertEquals(bamBuilder.getnLoci(), nLoci); + + final List reads = bamBuilder.makeReads(); + Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads()); + for ( final GATKSAMRecord read : reads ) { + assertGoodRead(read, bamBuilder); + } + + final File bam = bamBuilder.makeTemporarilyBAMFile(); + final SAMFileReader reader = new SAMFileReader(bam); + Assert.assertTrue(reader.hasIndex()); + final Iterator bamIt = reader.iterator(); + int nReadsFromBam = 0; + int lastStart = -1; + while ( bamIt.hasNext() ) { + final SAMRecord read = bamIt.next(); + assertGoodRead(read, bamBuilder); + nReadsFromBam++; + Assert.assertTrue(read.getAlignmentStart() >= lastStart); + lastStart = read.getAlignmentStart(); + } + Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads()); + } + + private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) { + Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength()); + Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength()); + Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength()); + Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart()); + Assert.assertNotNull(read.getReadGroup()); + } +} + +