ArtificialBAMBuilder utility class for creating streams of GATKSAMRecords with a variety of properties
-- Allows us to make a stream of reads or an index BAM file with read having the following properties (coming from n samples, of fixed read length and aligned to the genome with M operator, having N reads per alignment start, skipping N bases between each alignment start, starting at a given alignment start) -- This stream can be handed back to the caller immediately, or written to an indexed BAM file -- Update LocusIteratorByStateUnitTest to use this functionality (which was refactored from LIBS unit tests and ArtificialSAMUtils)
This commit is contained in:
parent
ec1cfe6732
commit
4d0e7b50ec
|
|
@ -0,0 +1,176 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Easy to use creator of artificial BAM files for testing
|
||||
*
|
||||
* Allows us to make a stream of reads or an index BAM file with read having the following properties
|
||||
*
|
||||
* - coming from n samples
|
||||
* - of fixed read length and aligned to the genome with M operator
|
||||
* - having N reads per alignment start
|
||||
* - skipping N bases between each alignment start
|
||||
* - starting at a given alignment start
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 1/15/13
|
||||
* Time: 9:22 AM
|
||||
*/
|
||||
public class ArtificialBAMBuilder {
|
||||
public final static int BAM_SHARD_SIZE = 16384;
|
||||
|
||||
final int nReadsPerLocus;
|
||||
final int nLoci;
|
||||
|
||||
int skipNLoci = 0;
|
||||
int alignmentStart = 1;
|
||||
int readLength = 10;
|
||||
private final ArrayList<String> samples = new ArrayList<String>();
|
||||
|
||||
final SAMFileWriterFactory factory = new SAMFileWriterFactory();
|
||||
{
|
||||
factory.setCreateIndex(true);
|
||||
}
|
||||
|
||||
SAMFileHeader header;
|
||||
|
||||
public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) {
|
||||
this.nReadsPerLocus = nReadsPerLocus;
|
||||
this.nLoci = nLoci;
|
||||
createAndSetHeader(1);
|
||||
}
|
||||
|
||||
public ArtificialBAMBuilder createAndSetHeader(final int nSamples) {
|
||||
this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
samples.clear();
|
||||
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i);
|
||||
final String sample = "sample" + i;
|
||||
samples.add(sample);
|
||||
rg.setSample(sample);
|
||||
rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform());
|
||||
header.addReadGroup(rg);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<String> getSamples() {
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a read stream based on the parameters. The cigar string for each
|
||||
* read will be *M, where * is the length of the read.
|
||||
*
|
||||
* Useful for testing things like LocusIteratorBystate
|
||||
*
|
||||
* @return a ordered list of reads
|
||||
*/
|
||||
public List<GATKSAMRecord> makeReads() {
|
||||
final String baseName = "read";
|
||||
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(nReadsPerLocus*nLoci);
|
||||
for ( int locusI = 0; locusI < nLoci; locusI++) {
|
||||
final int locus = locusI * (skipNLoci + 1);
|
||||
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
|
||||
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {
|
||||
final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId());
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord(rg));
|
||||
reads.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit()
|
||||
* @return the BAM file
|
||||
*/
|
||||
public File makeTemporarilyBAMFile() {
|
||||
try {
|
||||
final File file = File.createTempFile("tempBAM", ".bam");
|
||||
file.deleteOnExit();
|
||||
return makeBAMFile(file);
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the reads from this builder to output, creating an index as well
|
||||
* @param output the output BAM file we want to use
|
||||
* @return
|
||||
*/
|
||||
public File makeBAMFile(final File output) {
|
||||
final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0);
|
||||
for ( final GATKSAMRecord read : makeReads() )
|
||||
writer.addAlignment(read);
|
||||
writer.close();
|
||||
return output;
|
||||
}
|
||||
|
||||
public int getnReadsPerLocus() { return nReadsPerLocus; }
|
||||
public int getnLoci() { return nLoci; }
|
||||
public int getSkipNLoci() { return skipNLoci; }
|
||||
public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; }
|
||||
public int getAlignmentStart() { return alignmentStart; }
|
||||
public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; }
|
||||
public int getReadLength() { return readLength; }
|
||||
public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; }
|
||||
public SAMFileHeader getHeader() { return header; }
|
||||
public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; }
|
||||
|
||||
public int getNSamples() { return samples.size(); }
|
||||
|
||||
public int expectedNumberOfReads() {
|
||||
return nLoci * nReadsPerLocus * header.getReadGroups().size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ArtificialBAMBuilder{" +
|
||||
"samples=" + samples +
|
||||
", readLength=" + readLength +
|
||||
", alignmentStart=" + alignmentStart +
|
||||
", skipNLoci=" + skipNLoci +
|
||||
", nLoci=" + nLoci +
|
||||
", nReadsPerLocus=" + nReadsPerLocus +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
@ -327,35 +327,6 @@ public class ArtificialSAMUtils {
|
|||
return stack;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a read stream based on the parameters. The cigar string for each
|
||||
* read will be *M, where * is the length of the read.
|
||||
*
|
||||
* Useful for testing things like LocusIteratorBystate
|
||||
*
|
||||
* @return a collection of stackSize reads all sharing the above properties
|
||||
*/
|
||||
public static List<GATKSAMRecord> createReadStream( final int nReadsPerLocus,
|
||||
final int nLoci,
|
||||
final SAMFileHeader header,
|
||||
final int alignmentStart,
|
||||
final int length ) {
|
||||
final String baseName = "read";
|
||||
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(nReadsPerLocus*nLoci);
|
||||
for ( int locus = 0; locus < nLoci; locus++ ) {
|
||||
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
|
||||
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {
|
||||
final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId());
|
||||
final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length);
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord(rg));
|
||||
reads.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* create an iterator containing the specified read piles
|
||||
*
|
||||
|
|
|
|||
|
|
@ -28,17 +28,16 @@ package org.broadinstitute.sting.utils.locusiterator;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -447,26 +446,19 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
|
|||
//logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample));
|
||||
final int readLength = 10;
|
||||
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
|
||||
final List<String> samples = new ArrayList<String>(nSamples);
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i);
|
||||
final String sample = "sample" + i;
|
||||
samples.add(sample);
|
||||
rg.setSample(sample);
|
||||
rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform());
|
||||
header.addReadGroup(rg);
|
||||
}
|
||||
|
||||
final boolean downsample = downsampleTo != -1;
|
||||
final DownsamplingMethod downsampler = downsample
|
||||
? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false)
|
||||
: new DownsamplingMethod(DownsampleType.NONE, null, null, false);
|
||||
final List<GATKSAMRecord> reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength);
|
||||
|
||||
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci);
|
||||
bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1);
|
||||
|
||||
final List<GATKSAMRecord> reads = bamBuilder.makeReads();
|
||||
li = new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
|
||||
createTestReadProperties(downsampler, keepReads),
|
||||
genomeLocParser,
|
||||
samples);
|
||||
bamBuilder.getSamples());
|
||||
|
||||
final Set<GATKSAMRecord> seenSoFar = new HashSet<GATKSAMRecord>();
|
||||
final Set<GATKSAMRecord> keptReads = new HashSet<GATKSAMRecord>();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,122 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.commons.collections.IteratorUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 1/15/13
|
||||
* Time: 3:49 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ArtificialBAMBuilderUnitTest extends BaseTest {
|
||||
@DataProvider(name = "CombinatorialARTTilingProvider")
|
||||
public Object[][] makeCombinatorialARTTilingProvider() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
|
||||
final List<Integer> starts = Arrays.asList(
|
||||
1, // very start of the chromosome
|
||||
ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary
|
||||
ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary
|
||||
);
|
||||
|
||||
for ( final int readLength : Arrays.asList(10, 20) ) {
|
||||
for ( final int skips : Arrays.asList(0, 1, 10) ) {
|
||||
for ( final int start : starts ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 2) ) {
|
||||
for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) {
|
||||
for ( final int nLoci : Arrays.asList(10, 100, 1000) ) {
|
||||
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci);
|
||||
bamBuilder.setReadLength(readLength);
|
||||
bamBuilder.setSkipNLoci(skips);
|
||||
bamBuilder.setAlignmentStart(start);
|
||||
bamBuilder.createAndSetHeader(nSamples);
|
||||
tests.add(new Object[]{bamBuilder, readLength, skips, start, nSamples, nReadsPerLocus, nLoci});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CombinatorialARTTilingProvider")
|
||||
public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) {
|
||||
Assert.assertEquals(bamBuilder.getReadLength(), readLength);
|
||||
Assert.assertEquals(bamBuilder.getSkipNLoci(), skips);
|
||||
Assert.assertEquals(bamBuilder.getAlignmentStart(), start);
|
||||
Assert.assertEquals(bamBuilder.getNSamples(), nSamples);
|
||||
Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus);
|
||||
Assert.assertEquals(bamBuilder.getnLoci(), nLoci);
|
||||
|
||||
final List<GATKSAMRecord> reads = bamBuilder.makeReads();
|
||||
Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads());
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
assertGoodRead(read, bamBuilder);
|
||||
}
|
||||
|
||||
final File bam = bamBuilder.makeTemporarilyBAMFile();
|
||||
final SAMFileReader reader = new SAMFileReader(bam);
|
||||
Assert.assertTrue(reader.hasIndex());
|
||||
final Iterator<SAMRecord> bamIt = reader.iterator();
|
||||
int nReadsFromBam = 0;
|
||||
int lastStart = -1;
|
||||
while ( bamIt.hasNext() ) {
|
||||
final SAMRecord read = bamIt.next();
|
||||
assertGoodRead(read, bamBuilder);
|
||||
nReadsFromBam++;
|
||||
Assert.assertTrue(read.getAlignmentStart() >= lastStart);
|
||||
lastStart = read.getAlignmentStart();
|
||||
}
|
||||
Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads());
|
||||
}
|
||||
|
||||
private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) {
|
||||
Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength());
|
||||
Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength());
|
||||
Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength());
|
||||
Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart());
|
||||
Assert.assertNotNull(read.getReadGroup());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue