ArtificialBAMBuilder utility class for creating streams of GATKSAMRecords with a variety of properties

--  Allows us to make a stream of reads or an index BAM file with read having the following properties (coming from n samples, of fixed read length and aligned to the genome with M operator, having N reads per alignment start, skipping N bases between each alignment start, starting at a given alignment start)
-- This stream can be handed back to the caller immediately, or written to an indexed BAM file
-- Update LocusIteratorByStateUnitTest to use this functionality (which was refactored from LIBS unit tests and ArtificialSAMUtils)
This commit is contained in:
Mark DePristo 2013-01-15 16:45:45 -05:00
parent ec1cfe6732
commit 4d0e7b50ec
4 changed files with 305 additions and 44 deletions

View File

@ -0,0 +1,176 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.NGSPlatform;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Easy to use creator of artificial BAM files for testing
*
* Allows us to make a stream of reads or an index BAM file with read having the following properties
*
* - coming from n samples
* - of fixed read length and aligned to the genome with M operator
* - having N reads per alignment start
* - skipping N bases between each alignment start
* - starting at a given alignment start
*
* User: depristo
* Date: 1/15/13
* Time: 9:22 AM
*/
public class ArtificialBAMBuilder {
public final static int BAM_SHARD_SIZE = 16384;
final int nReadsPerLocus;
final int nLoci;
int skipNLoci = 0;
int alignmentStart = 1;
int readLength = 10;
private final ArrayList<String> samples = new ArrayList<String>();
final SAMFileWriterFactory factory = new SAMFileWriterFactory();
{
factory.setCreateIndex(true);
}
SAMFileHeader header;
public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) {
this.nReadsPerLocus = nReadsPerLocus;
this.nLoci = nLoci;
createAndSetHeader(1);
}
public ArtificialBAMBuilder createAndSetHeader(final int nSamples) {
this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
samples.clear();
for ( int i = 0; i < nSamples; i++ ) {
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i);
final String sample = "sample" + i;
samples.add(sample);
rg.setSample(sample);
rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform());
header.addReadGroup(rg);
}
return this;
}
public List<String> getSamples() {
return samples;
}
/**
* Create a read stream based on the parameters. The cigar string for each
* read will be *M, where * is the length of the read.
*
* Useful for testing things like LocusIteratorBystate
*
* @return a ordered list of reads
*/
public List<GATKSAMRecord> makeReads() {
final String baseName = "read";
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(nReadsPerLocus*nLoci);
for ( int locusI = 0; locusI < nLoci; locusI++) {
final int locus = locusI * (skipNLoci + 1);
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {
final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId());
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength);
read.setReadGroup(new GATKSAMReadGroupRecord(rg));
reads.add(read);
}
}
}
return reads;
}
/**
* Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit()
* @return the BAM file
*/
public File makeTemporarilyBAMFile() {
try {
final File file = File.createTempFile("tempBAM", ".bam");
file.deleteOnExit();
return makeBAMFile(file);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
/**
* Write the reads from this builder to output, creating an index as well
* @param output the output BAM file we want to use
* @return
*/
public File makeBAMFile(final File output) {
final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0);
for ( final GATKSAMRecord read : makeReads() )
writer.addAlignment(read);
writer.close();
return output;
}
public int getnReadsPerLocus() { return nReadsPerLocus; }
public int getnLoci() { return nLoci; }
public int getSkipNLoci() { return skipNLoci; }
public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; }
public int getAlignmentStart() { return alignmentStart; }
public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; }
public int getReadLength() { return readLength; }
public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; }
public SAMFileHeader getHeader() { return header; }
public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; }
public int getNSamples() { return samples.size(); }
public int expectedNumberOfReads() {
return nLoci * nReadsPerLocus * header.getReadGroups().size();
}
@Override
public String toString() {
return "ArtificialBAMBuilder{" +
"samples=" + samples +
", readLength=" + readLength +
", alignmentStart=" + alignmentStart +
", skipNLoci=" + skipNLoci +
", nLoci=" + nLoci +
", nReadsPerLocus=" + nReadsPerLocus +
'}';
}
}

View File

@ -327,35 +327,6 @@ public class ArtificialSAMUtils {
return stack;
}
/**
* Create a read stream based on the parameters. The cigar string for each
* read will be *M, where * is the length of the read.
*
* Useful for testing things like LocusIteratorBystate
*
* @return a collection of stackSize reads all sharing the above properties
*/
public static List<GATKSAMRecord> createReadStream( final int nReadsPerLocus,
final int nLoci,
final SAMFileHeader header,
final int alignmentStart,
final int length ) {
final String baseName = "read";
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(nReadsPerLocus*nLoci);
for ( int locus = 0; locus < nLoci; locus++ ) {
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {
final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId());
final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length);
read.setReadGroup(new GATKSAMReadGroupRecord(rg));
reads.add(read);
}
}
}
return reads;
}
/**
* create an iterator containing the specified read piles
*

View File

@ -28,17 +28,16 @@ package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -447,26 +446,19 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
//logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample));
final int readLength = 10;
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
final List<String> samples = new ArrayList<String>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i);
final String sample = "sample" + i;
samples.add(sample);
rg.setSample(sample);
rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform());
header.addReadGroup(rg);
}
final boolean downsample = downsampleTo != -1;
final DownsamplingMethod downsampler = downsample
? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false)
: new DownsamplingMethod(DownsampleType.NONE, null, null, false);
final List<GATKSAMRecord> reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength);
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci);
bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1);
final List<GATKSAMRecord> reads = bamBuilder.makeReads();
li = new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
createTestReadProperties(downsampler, keepReads),
genomeLocParser,
samples);
bamBuilder.getSamples());
final Set<GATKSAMRecord> seenSoFar = new HashSet<GATKSAMRecord>();
final Set<GATKSAMRecord> keptReads = new HashSet<GATKSAMRecord>();

View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.apache.commons.collections.IteratorUtils;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 1/15/13
* Time: 3:49 PM
* To change this template use File | Settings | File Templates.
*/
public class ArtificialBAMBuilderUnitTest extends BaseTest {
@DataProvider(name = "CombinatorialARTTilingProvider")
public Object[][] makeCombinatorialARTTilingProvider() {
final List<Object[]> tests = new LinkedList<Object[]>();
final List<Integer> starts = Arrays.asList(
1, // very start of the chromosome
ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary
ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary
);
for ( final int readLength : Arrays.asList(10, 20) ) {
for ( final int skips : Arrays.asList(0, 1, 10) ) {
for ( final int start : starts ) {
for ( final int nSamples : Arrays.asList(1, 2) ) {
for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) {
for ( final int nLoci : Arrays.asList(10, 100, 1000) ) {
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci);
bamBuilder.setReadLength(readLength);
bamBuilder.setSkipNLoci(skips);
bamBuilder.setAlignmentStart(start);
bamBuilder.createAndSetHeader(nSamples);
tests.add(new Object[]{bamBuilder, readLength, skips, start, nSamples, nReadsPerLocus, nLoci});
}
}
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "CombinatorialARTTilingProvider")
public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) {
Assert.assertEquals(bamBuilder.getReadLength(), readLength);
Assert.assertEquals(bamBuilder.getSkipNLoci(), skips);
Assert.assertEquals(bamBuilder.getAlignmentStart(), start);
Assert.assertEquals(bamBuilder.getNSamples(), nSamples);
Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus);
Assert.assertEquals(bamBuilder.getnLoci(), nLoci);
final List<GATKSAMRecord> reads = bamBuilder.makeReads();
Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads());
for ( final GATKSAMRecord read : reads ) {
assertGoodRead(read, bamBuilder);
}
final File bam = bamBuilder.makeTemporarilyBAMFile();
final SAMFileReader reader = new SAMFileReader(bam);
Assert.assertTrue(reader.hasIndex());
final Iterator<SAMRecord> bamIt = reader.iterator();
int nReadsFromBam = 0;
int lastStart = -1;
while ( bamIt.hasNext() ) {
final SAMRecord read = bamIt.next();
assertGoodRead(read, bamBuilder);
nReadsFromBam++;
Assert.assertTrue(read.getAlignmentStart() >= lastStart);
lastStart = read.getAlignmentStart();
}
Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads());
}
private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) {
Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength());
Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength());
Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength());
Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart());
Assert.assertNotNull(read.getReadGroup());
}
}