added tools to test read based traversals using the artificial in-memory SAM file tools, and testing of the PrintReadsWalker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@957 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-10 01:52:25 +00:00
parent eb962fe52a
commit 36c98b9d6c
5 changed files with 503 additions and 1 deletions

View File

@ -0,0 +1,139 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class ArtificialPatternedSAMIterator
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator {
/** the pattern we're implementing */
public enum PATTERN {
RANDOM_READS, IN_ORDER_READS;
}
// our pattern
private final PATTERN mPattern;
/**
* this is pretty heavy (and it could be extremely heavy, given the amount of reads they request, but it
* allows us to give them each read once, reguardless of the order specified
*/
private final int[] reads;
private final int readCount;
/**
* create the fake iterator, given the mapping of chromosomes and read counts. If pattern
* is specified to be random, it will generate reads that are randomly placed on the current chromosome
*
* @param startingChr the starting chromosome
* @param endingChr the ending chromosome
* @param readCount the number of reads in each chromosome
* @param header the associated header
* @param pattern the pattern to implement
*/
ArtificialPatternedSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header, PATTERN pattern ) {
super(startingChr, endingChr, readCount, unmappedReadCount, header);
mPattern = pattern;
this.readCount = readCount;
reads = new int[readCount];
for (int x = 0; x < readCount; x++) {
reads[x] = x;
}
if (pattern == PATTERN.RANDOM_READS) {
// scramble a bunch of the reads
for (int y = 0; y < readCount; y++) {
int ranOne = (int) Math.round(Math.random() * ( readCount - 1 ));
int ranTwo = (int) Math.round(Math.random() * ( readCount - 1 ));
int temp = reads[ranOne];
reads[ranOne] = reads[ranTwo];
reads[ranTwo] = temp;
}
/**
* up to this point there's no garauntee that the random() has made the reads out of order (though it's
* extremely extremely unlikely it's failed). Let's make sure there at least out of order:
*/
if (this.reads[0] < this.reads[reads.length -1]) {
int temp = reads[0];
reads[0] = reads[reads.length -1];
reads[reads.length -1] = temp;
}
}
}
/**
* override the default ArtificialSAMIterator createNextRead method, which creates the next read
* @return
*/
protected boolean createNextRead() {
if (currentRead >= rCount) {
currentChromo++;
currentRead = 0;
}
// check for end condition, have we finished the chromosome listing, and have no unmapped reads
if (currentChromo >= eChromosomeCount) {
if (uMappedReadCount < 1) {
this.next = null;
return false;
} else {
++totalReadCount;
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), -1, -1, 50);
--uMappedReadCount;
return true;
}
}
++totalReadCount;
this.next = getNextRecord(currentRead);
++currentRead;
return true;
}
/**
* get the next read, given it's index in the chromosome
* @param read the read index in the chromosome
* @return a SAMRecord
*/
private SAMRecord getNextRecord( int read ) {
if (read > this.readCount) {
return ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(reads[readCount - 1]), currentChromo, reads[readCount - 1], 50);
}
return ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(reads[read]), currentChromo, reads[read], 50);
}
}

View File

@ -0,0 +1,144 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.apache.log4j.Logger;
import java.util.Arrays;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class ArtificialReadsTraversal
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public class ArtificialReadsTraversal extends TraversalEngine {
public int startingChr = 1;
public int endingChr = 5;
public int readsPerChr = 100;
public int unMappedReads = 1000;
private int DEFAULT_READ_LENGTH = ArtificialSamUtils.DEFAULT_READ_LENGTH;
private ArtificialPatternedSAMIterator iter;
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(ArtificialReadsTraversal.class);
/** Creates a new, uninitialized ArtificialReadsTraversal */
public ArtificialReadsTraversal() {
super(null, null, null);
}
// what read ordering are we using
private ArtificialPatternedSAMIterator.PATTERN readOrder = ArtificialPatternedSAMIterator.PATTERN.IN_ORDER_READS;
/**
* set the read ordering of the reads given to the walker
*
* @param readOrdering
*/
public void setReadOrder( ArtificialPatternedSAMIterator.PATTERN readOrdering ) {
readOrder = readOrdering;
}
/**
* Traverse by reads, given the data and the walker
*
* @param walker the walker to traverse with
* @param shard the shard, specifying the range of data to iterate over
* @param dataProvider the provider of the reads data
* @param sum the value of type T, specified by the walker, to feed to the walkers reduce function
* @param <M> the map type of the walker
* @param <T> the reduce type of the walker
*
* @return the reduce variable of the read walker
*/
public <M, T> T traverse( Walker<M, T> walker,
Shard shard,
ShardDataProvider dataProvider,
T sum ) {
if (!( walker instanceof ReadWalker ))
throw new IllegalArgumentException("Walker isn't a read walker!");
ReadWalker<M, T> readWalker = (ReadWalker<M, T>) walker;
SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readsPerChr + DEFAULT_READ_LENGTH);
iter = new ArtificialPatternedSAMIterator(this.startingChr,
this.endingChr,
this.readsPerChr,
this.unMappedReads,
header,
this.readOrder);
// while we still have more reads
for (SAMRecord read : iter) {
// our locus context
LocusContext locus = null;
// an array of characters that represent the reference
char[] refSeq = null;
// update the number of reads we've seen
TraversalStatistics.nRecords++;
final boolean keepMeP = readWalker.filter(refSeq, read);
if (keepMeP) {
M x = readWalker.map(refSeq, read);
sum = readWalker.reduce(x, sum);
}
if (locus != null) { printProgress("loci", locus.getLocation()); }
}
return sum;
}
/**
* Temporary override of printOnTraversalDone.
* TODO: Add some sort of TE.getName() function once all TraversalEngines are ported.
*
* @param sum Result of the computation.
* @param <T> Type of the result.
*/
public <T> void printOnTraversalDone( T sum ) {
printOnTraversalDone("reads", sum);
}
}

View File

@ -109,7 +109,7 @@ public class ArtificialSAMIterator implements PeekingStingIterator {
return false;
}
private boolean createNextRead() {
protected boolean createNextRead() {
if (currentRead >= rCount) {
currentChromo++;
currentRead = 0;
@ -143,6 +143,10 @@ public class ArtificialSAMIterator implements PeekingStingIterator {
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
}
/**
* return this iterator, for the iterable interface
* @return
*/
public Iterator<SAMRecord> iterator() {
return this;
}

View File

@ -0,0 +1,118 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.executive.Accumulator;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider;
import org.broadinstitute.sting.utils.sam.ArtificialReadsTraversal;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class PrintReadsWalkerTest
* <p/>
* This tests the print reads walker, using the artificial reads traversal
*/
public class PrintReadsWalkerTest extends BaseTest {
/**
* our private fake reads traversal. This traversal seeds the
* the walker with a specified number of fake reads, which are named sequentially
*/
private ArtificialReadsTraversal trav;
private int readTotal = 0;
private char bases[] = {'a', 't'};
@Before
public void before() {
trav = new ArtificialReadsTraversal();
readTotal = ( ( trav.endingChr - trav.startingChr ) + 1 ) * trav.readsPerChr + trav.unMappedReads;
}
/** test that we get out the same number of reads we put in */
@Test
public void testReadCount() {
PrintReadsWalker walker = new PrintReadsWalker();
ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter();
trav.traverse(walker, (Shard) null, (ShardDataProvider) null, writer);
assertEquals(readTotal, writer.getRecords().size());
}
/** test that we're ok with a null read */
@Test
public void testNullRead() {
PrintReadsWalker walker = new PrintReadsWalker();
SAMRecord rec = walker.map(bases, null);
assertTrue(rec == null);
}
/** tes that we get the read we put into the map function */
@Test
public void testReturnRead() {
PrintReadsWalker walker = new PrintReadsWalker();
SAMFileHeader head = ArtificialSamUtils.createArtificialSamHeader(3,1,1000);
SAMRecord rec = ArtificialSamUtils.createArtificialRead(head, "FakeRead", 1, 1, 50);
SAMRecord ret = walker.map(bases, rec);
assertTrue(ret == rec);
assertTrue(ret.getReadName().equals(rec.getReadName()));
}
/** test that the read makes it to the output source */
@Test
public void testReducingRead() {
PrintReadsWalker walker = new PrintReadsWalker();
SAMFileHeader head = ArtificialSamUtils.createArtificialSamHeader(3,1,1000);
SAMRecord rec = ArtificialSamUtils.createArtificialRead(head, "FakeRead", 1, 1, 50);
ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter();
SAMRecord ret = walker.map(bases, null);
walker.reduce(ret,writer);
assertTrue(writer.getRecords().size() == 1);
}
/** test that we close the output source */
@Test
public void testClosingOnTraversalDone() {
ArtificialSAMFileWriter writer = new ArtificialSAMFileWriter();
assertTrue(!writer.isClosed());
PrintReadsWalker walker = new PrintReadsWalker();
walker.onTraversalDone(writer);
assertTrue(writer.isClosed());
}
}

View File

@ -0,0 +1,97 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.BaseTest;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.fail;
import static org.junit.Assert.assertTrue;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
*
* @author aaron
*
* Class ArtificialPatternedSAMIteratorTest
*
* tests ArtificialPatternedSAMIterator, making sure that if you specify in order
* you get reads in order, and if you specify out of order you get them out of order.
*/
public class ArtificialPatternedSAMIteratorTest extends BaseTest {
// our artifical patterned iterator
ArtificialPatternedSAMIterator iter;
private int startingChr = 1;
private int endingChr = 2;
private int readCount = 100;
private int DEFAULT_READ_LENGTH = ArtificialSamUtils.DEFAULT_READ_LENGTH;
SAMFileHeader header;
@Before
public void before() {
header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
}
@Test
public void testInOrder() {
iter = new ArtificialPatternedSAMIterator(startingChr,endingChr,readCount,0,header, ArtificialPatternedSAMIterator.PATTERN.IN_ORDER_READS);
if (!iter.hasNext()) {
fail("no reads in the ArtificialPatternedSAMIterator");
}
SAMRecord last = iter.next();
while (iter.hasNext()) {
SAMRecord rec = iter.next();
if (!(rec.getReferenceIndex() > last.getReferenceIndex()) && (rec.getAlignmentStart() <= last.getAlignmentStart())) {
fail("read " + rec.getReadName() + " out of order compared to last read, " + last.getReadName());
}
last = rec;
}
}
@Test
public void testOutOfOrder() {
int outOfOrderCount = 0;
iter = new ArtificialPatternedSAMIterator(startingChr,endingChr,readCount,0,header, ArtificialPatternedSAMIterator.PATTERN.RANDOM_READS);
if (!iter.hasNext()) {
fail("no reads in the ArtificialPatternedSAMIterator");
}
SAMRecord last = iter.next();
while (iter.hasNext()) {
SAMRecord rec = iter.next();
if (!(rec.getReferenceIndex() > last.getReferenceIndex()) && (rec.getAlignmentStart() <= last.getAlignmentStart())) {
++outOfOrderCount;
}
last = rec;
}
assertTrue(outOfOrderCount > 0);
}
}