diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java index acbe78565..42d24b83b 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java @@ -56,16 +56,17 @@ public class IntervalShardStrategy implements ShardStrategy { * * @param size the next recommended shard size. */ - public void adjustNextShardSize(long size) { + public void adjustNextShardSize( long size ) { this.size = size; } /** * construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs + * * @param size * @param locations */ - IntervalShardStrategy(long size, GenomeLocSortedSet locations) { + IntervalShardStrategy( long size, GenomeLocSortedSet locations ) { if (locations == null || locations.isEmpty()) { throw new StingException("IntervalShardStrategy: genomic regions list is empty."); } @@ -73,12 +74,12 @@ public class IntervalShardStrategy implements ShardStrategy { this.size = size; } - /** + /** * the default constructor * - * @param dict the sequence dictionary to use + * @param dict the sequence dictionary to use */ - IntervalShardStrategy(SAMSequenceDictionary dict, GenomeLocSortedSet locations) { + IntervalShardStrategy( SAMSequenceDictionary dict, GenomeLocSortedSet locations ) { if (locations == null || locations.isEmpty()) { throw new StingException("IntervalShardStrategy: genomic regions list is empty."); } @@ -88,44 +89,39 @@ public class IntervalShardStrategy implements ShardStrategy { /** * returns true if there are additional shards + * * @return false if we're done processing shards */ public boolean hasNext() { - return (!regions.isEmpty()); + return ( !regions.isEmpty() ); } /** * gets the next Shard + * * @return the next shard */ public Shard next() { - if ((this.regions == null) || (regions.isEmpty())) { + if (( this.regions == null ) || ( regions.isEmpty() )) { throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function."); } // get the first region in the list GenomeLoc loc = regions.iterator().next(); - if (loc.getStop() - loc.getStart() <= this.size) { - regions.removeRegion(loc); - return new IntervalShard(loc); - } else { - GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1); - regions.removeRegion(subLoc); - return new IntervalShard(subLoc); - } - + regions.removeRegion(loc); + return new IntervalShard(loc); + } - /** - * we don't support the remove command - */ + /** we don't support the remove command */ public void remove() { throw new UnsupportedOperationException("ShardStrategies don't support remove()"); } /** * makes the ReadIntervalShard iterable, i.e. usable in a for loop. + * * @return */ public Iterator iterator() { diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 9372c815e..34316167d 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -5,13 +5,11 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; -import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.utils.GenomeLoc; @@ -35,43 +33,53 @@ import java.util.List; * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. */ public class SAMDataSource implements SimpleDataSource { + + /** Backing support for reads. */ private Reads reads = null; - /** our SAM data files */ - private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; - /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(SAMDataSource.class); - // How strict should we be with SAM/BAM parsing? - protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT; - - // our list of readers - private final List samFileList = new ArrayList(); - - /** SAM header file. */ - private final SAMFileHeader header; - // used for the reads case, the last count of reads retrieved - private long readsTaken = 0; + long readsTaken = 0; // our last genome loc position - private GenomeLoc lastReadPos = null; + GenomeLoc lastReadPos = null; // do we take unmapped reads private boolean includeUnmappedReads = true; // reads based traversal variables private boolean intoUnmappedReads = false; - private int readsAtLastPos = 0; + private int readsSeenAtLastPos = 0; + + + /** + * package protected getter and setter for the iterator generator + * + * @return + */ + IteratorGenerator getIterGen() { + return iterGen; + } + + void setIterGen( IteratorGenerator iterGen ) { + this.iterGen = iterGen; + } + + // where we get out iterators from + private IteratorGenerator iterGen; /** * constructor, given sam files * * @param reads the list of sam files + * @param byReads are we a by reads traversal, or a loci traversal. We could delete this field + * if we passed in iterGen, which would be a better (although more complicated for the + * consumers of SAMDataSources). */ - public SAMDataSource(Reads reads) throws SimpleDataSourceLoadException { + public SAMDataSource( Reads reads, boolean byReads ) throws SimpleDataSourceLoadException { this.reads = reads; // check the length @@ -82,32 +90,11 @@ public class SAMDataSource implements SimpleDataSource { if (!smFile.canRead()) { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); } - samFileList.add(smFile); - } + iterGen = new MSR2IteratorGenerator(reads, byReads); - header = createHeaderMerger().getMergedHeader(); } - /** - * Load a SAM/BAM, given an input file. - * - * @param samFile the file name - * @return a SAMFileReader for the file, null if we're attempting to read a list - */ - private SAMFileReader initializeSAMFile(final File samFile) { - if (samFile.toString().endsWith(".list")) { - return null; - } else { - SAMFileReader samReader = new SAMFileReader(samFile, true); - samReader.setValidationStringency(strictness); - - final SAMFileHeader header = samReader.getFileHeader(); - logger.debug(String.format("Sort order is: " + header.getSortOrder())); - - return samReader; - } - } /** *

@@ -115,18 +102,12 @@ public class SAMDataSource implements SimpleDataSource { *

* * @param location the genome location to extract data for + * * @return an iterator for that region */ - public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException { - - // right now this is very heavy, it copies the file list into a reader list every time - SamFileHeaderMerger headerMerger = createHeaderMerger(); - - // make a merging iterator for this record - MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); - - iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1); - + public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException { + // make a merging iterator for this record + CloseableIterator iter = iterGen.seek(location); // return the iterator return StingSAMIteratorAdapter.adapt(reads, iter); } @@ -137,9 +118,10 @@ public class SAMDataSource implements SimpleDataSource { *

* * @param shard the shard to get data for + * * @return an iterator for that region */ - public StingSAMIterator seek(Shard shard) throws SimpleDataSourceLoadException { + public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException { StingSAMIterator iterator = null; if (shard.getShardType() == Shard.ShardType.READ) { iterator = seekRead((ReadShard) shard); @@ -149,8 +131,7 @@ public class SAMDataSource implements SimpleDataSource { reads.getMaxOnTheFlySorts(), reads.getFilterZeroMappingQualityReads(), reads.getSafetyChecking()); - } else if (shard.getShardType() == Shard.ShardType.LOCUS || - shard.getShardType() == Shard.ShardType.INTERVAL) { + } else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) { iterator = seekLocus(shard.getGenomeLoc()); iterator = TraversalEngine.applyDecoratingIterators(false, iterator, @@ -158,8 +139,7 @@ public class SAMDataSource implements SimpleDataSource { reads.getMaxOnTheFlySorts(), reads.getFilterZeroMappingQualityReads(), reads.getSafetyChecking()); - } - else { + } else { throw new StingException("seek: Unknown shard type"); } @@ -173,17 +153,7 @@ public class SAMDataSource implements SimpleDataSource { * @return SAM file header. */ public SAMFileHeader getHeader() { - return header; - } - - /** - * create the merging header. - * - * @return a SamFileHeaderMerger that includes the set of SAM files we were created with - */ - private SamFileHeaderMerger createHeaderMerger() { - List lst = GetReaderList(); - return new SamFileHeaderMerger(lst, SORT_ORDER, true); + return this.iterGen.getHeader(); } @@ -193,26 +163,33 @@ public class SAMDataSource implements SimpleDataSource { *

* * @param shard the read shard to extract from + * * @return an iterator for that region */ - private BoundedReadIterator seekRead(ReadShard shard) throws SimpleDataSourceLoadException { + private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException { BoundedReadIterator bound = null; - SamFileHeaderMerger headerMerger = createHeaderMerger(); - MergingSamRecordIterator2 iter = null; + CloseableIterator iter = null; if (!intoUnmappedReads) { - iter = new MergingSamRecordIterator2(headerMerger); - bound = fastMappedReadSeek(shard.getSize(), iter); + if (lastReadPos == null) { + lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE); + iter = iterGen.seek(lastReadPos); + return InitialReadIterator(shard.getSize(), iter); + } else { + lastReadPos.setStop(-1); + iter = iterGen.seek(lastReadPos); + bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter)); + } } - if ((bound == null || intoUnmappedReads) && includeUnmappedReads) { + + if (( bound == null || intoUnmappedReads ) && includeUnmappedReads) { if (iter != null) { iter.close(); } - iter = new MergingSamRecordIterator2(createHeaderMerger()); - bound = toUnmappedReads(shard.getSize(), iter); + iter = iterGen.seek(null); + bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter); } - if (bound == null) { shard.signalDone(); bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0); @@ -228,7 +205,7 @@ public class SAMDataSource implements SimpleDataSource { * * @param seeUnMappedReads true to see unmapped reads, false otherwise */ - public void viewUnmappedReads(boolean seeUnMappedReads) { + public void viewUnmappedReads( boolean seeUnMappedReads ) { includeUnmappedReads = seeUnMappedReads; } @@ -238,19 +215,21 @@ public class SAMDataSource implements SimpleDataSource { * * @param readCount how many reads to retrieve * @param iter the iterator to use + * * @return the bounded iterator that you can use to get the intervaled reads from * @throws SimpleDataSourceLoadException */ - private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { + BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException { int count = 0; + int cnt = 0; SAMRecord d = null; while (iter.hasNext()) { d = iter.peek(); int x = d.getReferenceIndex(); - if (x < 0 || x >= d.getHeader().getSequenceDictionary().getSequences().size()) { + if (x < 0) // we have the magic read that starts the unmapped read segment! break; - } + cnt++; iter.next(); } @@ -262,6 +241,7 @@ public class SAMDataSource implements SimpleDataSource { // now walk until we've taken the unmapped read count while (iter.hasNext() && count < this.readsTaken) { iter.next(); + count++; } // check to see what happened, did we run out of reads? @@ -280,108 +260,164 @@ public class SAMDataSource implements SimpleDataSource { * A seek function for unmapped reads. * * @param readCount how many reads to retrieve - * @param iter the iterator to use + * @param iter the iterator to use, seeked to the correct start location + * * @return the bounded iterator that you can use to get the intervaled reads from * @throws SimpleDataSourceLoadException */ - private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { - if (lastReadPos == null) { + BoundedReadIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException { + BoundedReadIterator bound; + correctForReadPileupSeek(iter); + if (readsTaken == 0) { return InitialReadIterator(readCount, iter); - } else { - BoundedReadIterator bound; - iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1); + } + int x = 0; + SAMRecord rec = null; + int lastPos = 0; - // move the number of reads we read from the last pos - boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.) - while (iter.hasNext() && this.readsAtLastPos > 0) { - iter.next(); - --readsAtLastPos; - atLeastOneReadSeen = true; - } - if (readsAtLastPos != 0 && atLeastOneReadSeen) { - throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0"); - } - - int x = 0; - SAMRecord rec = null; - int lastPos = 0; - - while (x < readsTaken) { - if (iter.hasNext()) { - rec = iter.next(); - if (lastPos == rec.getAlignmentStart()) { - ++this.readsAtLastPos; - } else { - this.readsAtLastPos = 1; - } - lastPos = rec.getAlignmentStart(); - ++x; + while (x < readsTaken) { + if (iter.hasNext()) { + rec = iter.next(); + if (lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos; + else this.readsSeenAtLastPos = 1; + lastPos = rec.getAlignmentStart(); + ++x; + } else { + // jump contigs + if (lastReadPos.toNextContig() == false) { + // check to see if we're using unmapped reads, if not return, we're done + readsTaken = 0; + intoUnmappedReads = true; + return null; } else { - // jump contigs - if (lastReadPos.toNextContig() == false) { - // check to see if we're using unmapped reads, if not return, we're done - readsTaken = 0; - intoUnmappedReads = true; - return null; - - } else { - iter.close(); - // now merge the headers - // right now this is pretty damn heavy, it copies the file list into a reader list every time - SamFileHeaderMerger mg = createHeaderMerger(); - iter = new MergingSamRecordIterator2(mg); - iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE); - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); - } + readsTaken = readCount; + readsSeenAtLastPos = 0; + lastReadPos.setStop(-1); + CloseableIterator ret = iterGen.seek(lastReadPos); + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount); } } - - - // if we're off the end of the last contig (into unmapped territory) - if (rec != null && rec.getAlignmentStart() == 0) { - readsTaken += readCount; - intoUnmappedReads = true; - } - // else we're not off the end, store our location - else if (rec != null) { - int stopPos = rec.getAlignmentStart(); - if (stopPos < lastReadPos.getStart()) { - lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos); - } else { - lastReadPos.setStop(rec.getAlignmentStart()); - } - } - // in case we're run out of reads, get out - else { - throw new StingException("Danger: weve run out reads in fastMappedReadSeek"); - //return null; - } - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); - - // return the iterator - return bound; } + // if we're off the end of the last contig (into unmapped territory) + if (rec != null && rec.getAlignmentStart() == 0) { + readsTaken += readCount; + intoUnmappedReads = true; + } + // else we're not off the end, store our location + else if (rec != null) { + int stopPos = rec.getAlignmentStart(); + if (stopPos < lastReadPos.getStart()) { + lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos); + } else { + lastReadPos.setStart(rec.getAlignmentStart()); + } + } + // in case we're run out of reads, get out + else { + throw new StingException("Danger: weve run out reads in fastMappedReadSeek"); + //return null; + } + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + + // return the iterator + return bound; } + /** + * Even though the iterator has seeked to the correct location, there may be multiple reads at that location, + * and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable + * + * @param iter the iterator + */ + private void correctForReadPileupSeek( StingSAMIterator iter ) { + // move the number of reads we read from the last pos + boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.) + while (iter.hasNext() && this.readsSeenAtLastPos > 0) { + iter.next(); + --readsSeenAtLastPos; + atLeastOneReadSeen = true; + } + if (readsSeenAtLastPos != 0 && atLeastOneReadSeen) { + throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0"); + } + } + + /** * set the initial iterator * * @param readCount the number of reads * @param iter the merging iterator + * * @return a bounded read iterator at the first read position in the file. */ - private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) { + private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator iter ) { BoundedReadIterator bound; - lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0); - iter.queryContained(lastReadPos.getContig(), 1, -1); bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); this.readsTaken = readCount; return bound; } +} + + +/** + * iterator generator + *

+ * This class generates iterators for the SAMDataSource. This class was introduced for testing purposes, + * since it became increasingly hard to test the SAM data source code. The class defines two abstraact + * methods: + *

+ * -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default + * location (which is implementation specific). + *

+ * -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this + * method out, since it doesn't seem necessary, and it would be much cleaner with out it. + */ +abstract class IteratorGenerator { + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(SAMDataSource.class); + + /** + * seek to a location + * + * @param seekTo the genome loc to seek to + * + * @return StingSAMIterator + */ + public abstract CloseableIterator seek( GenomeLoc seekTo ); + + /** + * get the merged header + * + * @return the merged header + */ + public abstract SAMFileHeader getHeader(); + + /** + * Load a SAM/BAM, given an input file. + * + * @param samFile the file name + * + * @return a SAMFileReader for the file, null if we're attempting to read a list + */ + protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) { + if (samFile.toString().endsWith(".list")) { + return null; + } else { + SAMFileReader samReader = new SAMFileReader(samFile, true); + samReader.setValidationStringency(strictness); + + final SAMFileHeader header = samReader.getFileHeader(); + logger.debug(String.format("Sort order is: " + header.getSortOrder())); + + return samReader; + } + } + /** * A private function that, given the internal file list, generates a SamFileReader * list of validated files. @@ -389,11 +425,11 @@ public class SAMDataSource implements SimpleDataSource { * @return a list of SAMFileReaders that represent the stored file names * @throws SimpleDataSourceLoadException if there's a problem loading the files */ - private List GetReaderList() throws SimpleDataSourceLoadException { + protected static List GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException { // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); - for (File f : this.samFileList) { - SAMFileReader reader = initializeSAMFile(f); + for (File f : reads.getReadsFiles()) { + SAMFileReader reader = initializeSAMFile(f, strictness); if (reader.getFileHeader().getReadGroups().size() < 1) { //logger.warn("Setting header in reader " + f.getName()); @@ -412,4 +448,65 @@ public class SAMDataSource implements SimpleDataSource { return lst; } + /** + * create the merging header. + * + * @return a SamFileHeaderMerger that includes the set of SAM files we were created with + */ + protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) { + List lst = GetReaderList(reads, strictness); + return new SamFileHeaderMerger(lst, SORT_ORDER, true); + } } + + +/** + * MSR2IteratorGenerator + *

+ * generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure, + * and a flag indicating if we're dealing with reads or loci (to determine the correct query function). + */ +class MSR2IteratorGenerator extends IteratorGenerator { + /** our read pile */ + private Reads reads; + + private SamFileHeaderMerger header; + + // How strict should we be with SAM/BAM parsing? + protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT; + + // are we by reads or by loci + protected boolean byReads = true; + + /** our SAM data files */ + private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; + + public MSR2IteratorGenerator( Reads reads, boolean byReads ) { + this.reads = reads; + this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder); + this.byReads = byReads; + } + + public CloseableIterator seek( GenomeLoc seekTo ) { + SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder); + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads); + if (seekTo != null) { + if (byReads) + iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop()); + else + iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop()); + } + return iter; + } + + /** + * get the merged header + * + * @return the merged header + */ + public SAMFileHeader getHeader() { + return header.getMergedHeader(); + } +} + + diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 8336007e9..eb6674105 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -75,11 +75,13 @@ public abstract class MicroScheduler { protected MicroScheduler(Walker walker, Reads reads, File refFile, List> rods) { if (walker instanceof ReadWalker) { traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods); + this.reads = getReadsDataSource(reads,true); } else { traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods); + this.reads = getReadsDataSource(reads,false); } - this.reads = getReadsDataSource(reads); + this.reference = openReferenceSequenceFile(refFile); this.rods = getReferenceOrderedDataSources(rods); } @@ -159,12 +161,12 @@ public abstract class MicroScheduler { * Gets a data source for the given set of reads. * @return A data source for the given set of reads. */ - private SAMDataSource getReadsDataSource(Reads reads) { + private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) { // By reference traversals are happy with no reads. Make sure that case is handled. if (reads.getReadsFiles().size() == 0) return null; - SAMDataSource dataSource = new SAMDataSource(reads); + SAMDataSource dataSource = new SAMDataSource(reads, byReads); // Side effect: initialize the traversal engine with reads data. // TODO: Give users a dedicated way of getting the header so that the MicroScheduler diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java index 9f8d9c8fb..eb5fb0a48 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/BoundedReadIterator.java @@ -7,20 +7,29 @@ import java.util.Iterator; import org.broadinstitute.sting.gatk.Reads; -/** +/* + * Copyright (c) 2009 The Broad Institute * - * User: aaron - * Date: Apr 14, 2009 - * Time: 5:27:31 PM + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java index 87f22ee96..12b022d44 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/MergingSamRecordIterator2.java @@ -31,7 +31,7 @@ import java.util.PriorityQueue; * iterable stream. The underlying iterators/files must all have the same sort order unless * the requested output format is unsorted, in which case any combination is valid. */ -public class MergingSamRecordIterator2 implements CloseableIterator, Iterable { +public class MergingSamRecordIterator2 implements CloseableIterator, Iterable, PeekingStingIterator { protected PriorityQueue pq = null; protected final SamFileHeaderMerger samHeaderMerger; protected final SAMFileHeader.SortOrder sortOrder; @@ -43,7 +43,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator, * Constructs a new merging iterator with the same set of readers and sort order as * provided by the header merger parameter. */ - public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) { + public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) { this.samHeaderMerger = headerMerger; this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); this.pq = new PriorityQueue(samHeaderMerger.getReaders().size()); @@ -275,6 +275,16 @@ public class MergingSamRecordIterator2 implements CloseableIterator, public Iterator iterator() { return this; } + + /** + * Gets source information for the reads. Contains information about the original reads + * files, plus information about downsampling, etc. + * + * @return + */ + public Reads getSourceInfo() { + return null; //To change body of implemented methods use File | Settings | File Templates. + } } // Should replace picard class with the same name diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PeekingStingIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/PeekingStingIterator.java new file mode 100644 index 000000000..8abf05409 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/PeekingStingIterator.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Interface Peekable + *

+ * This interface indicates that + */ +public interface PeekingStingIterator extends StingSAMIterator { + /** + * peek, given the specified type + * @return + */ + SAMRecord peek(); +} diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index f427117f6..7f029eeeb 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -337,34 +337,7 @@ public abstract class TraversalEngine { return true; } - protected Iterator initializeReads() { - if( readsFiles.size() == 1 ) - samReader = initializeSAMFile(readsFiles.get(0)); - else - samReader = null; - return wrapReadsIterator(getReadsIterator(samReader), true); - } - - protected Iterator getReadsIterator(final SAMFileReader samReader) { - // If the file has an index, querying functions are available. Use them if possible... - if ( samReader == null && readsFiles.size() > 0 ) { - SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; - - List readers = new ArrayList(); - - for ( File readsFile: readsFiles ) { - SAMFileReader reader = initializeSAMFile(readsFile); - readers.add(reader); - } - - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); - return new MergingSamRecordIterator2(headerMerger); - } - else { - return samReader.iterator(); - } - } - + @Deprecated protected StingSAMIterator wrapReadsIterator( final Iterator rawIterator, final boolean enableVerification ) { // Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 25c033806..50266bb09 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -246,39 +246,7 @@ public class TraverseDuplicates extends TraversalEngine { return sum; } - // -------------------------------------------------------------------------------------------------------------- - // - // old style interface to the system - // - // -------------------------------------------------------------------------------------------------------------- - @Override - public T traverse(Walker walker, List locations) { - if ( walker instanceof DuplicateWalker) { - Walker x = walker; - DuplicateWalker dupWalker = (DuplicateWalker)x; - return (T)this.traverseByRead(dupWalker, locations); - } else { - throw new IllegalArgumentException("Walker isn't a duplicate walker!"); - } - } - /** - * Should we deleted at the soonist possible opportunity - */ - public Object traverseByRead(DuplicateWalker walker, List locations) { - samReadIter = initializeReads(); - - // Initialize the walker - walker.initialize(); - - // Initialize the sum - FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc()); - T sum = actuallyTraverse(walker, filterIter, walker.reduceInit()); - - //printOnTraversalDone("reads", sum); - walker.onTraversalDone(sum); - return sum; - } // -------------------------------------------------------------------------------------------------------------- // diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java new file mode 100644 index 000000000..e371749f9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java @@ -0,0 +1,165 @@ +package org.broadinstitute.sting.utils.sam; + +import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; +import org.broadinstitute.sting.gatk.Reads; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; + +import java.util.Iterator; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** this fake iterator allows us to look at how specific piles of reads are handled */ +public class ArtificialSAMIterator implements PeekingStingIterator { + + + protected int currentChromo = 0; + protected int currentRead = 0; + protected int totalReadCount = 0; + protected boolean done = false; + // the next record + protected SAMRecord next = null; + protected SAMFileHeader header = null; + + // the passed in parameters + protected final int sChr; + protected final int eChromosomeCount; + protected final int rCount; + protected int uMappedReadCount; + + // let us know to make a read, we need this to help out the fake sam query iterator + private boolean initialized = false; + + /** + * create the fake iterator, given the mapping of chromosomes and read counts + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + */ + ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) { + sChr = startingChr; + eChromosomeCount = (endingChr - startingChr) + 1; + rCount = readCount; + this.header = header; + this.currentChromo = 0; + uMappedReadCount = 0; + } + + /** + * create the fake iterator, given the mapping of chromosomes and read counts + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + */ + ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) { + sChr = startingChr; + eChromosomeCount = (endingChr - startingChr) + 1; + rCount = readCount; + this.header = header; + this.currentChromo = 0; + this.uMappedReadCount = unmappedReadCount; + } + + + public Reads getSourceInfo() { + throw new UnsupportedOperationException("We don't support this"); + } + + public void close() { + // done + currentChromo = Integer.MAX_VALUE; + } + + public boolean hasNext() { + if (!initialized){ + initialized = true; + createNextRead(); + } + if (this.next != null) { + return true; + } + return false; + } + + private boolean createNextRead() { + if (currentRead >= rCount) { + currentChromo++; + currentRead = 0; + } + // check for end condition, have we finished the chromosome listing, and have no unmapped reads + if (currentChromo >= eChromosomeCount) { + if (uMappedReadCount < 1) { + this.next = null; + return false; + } else { + ++totalReadCount; + this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), -1, -1, 50); + --uMappedReadCount; + return true; + } + } + ++totalReadCount; + this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50); + ++currentRead; + return true; + } + + + public SAMRecord next() { + SAMRecord ret = next; + createNextRead(); + return ret; + } + + public void remove() { + throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator."); + } + + public Iterator iterator() { + return this; + } + + /** + * some instrumentation methods + */ + public int readsTaken() { + return totalReadCount; + } + + /** + * peek at the next sam record + * + * @return + */ + public SAMRecord peek() { + return this.next; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java new file mode 100644 index 000000000..c6f7793be --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java @@ -0,0 +1,181 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMRecord; + +import java.util.List; + +import org.broadinstitute.sting.utils.StingException; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class ArtificialSAMQueryIterator + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class ArtificialSAMQueryIterator extends ArtificialSAMIterator { + + // get the next positon + protected int finalPos = 0; + protected int startPos = 0; + protected int contigIndex = -1; + protected boolean overlapping = false; + protected int startingChr = 0; + protected boolean seeked = false; + + /** + * create the fake iterator, given the mapping of chromosomes and read counts + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + */ + ArtificialSAMQueryIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) { + super(startingChr, endingChr, readCount, unmappedReadCount, header); + this.startingChr = startingChr; + } + + /** + * query containing - get reads contained by the specified interval + * + * @param contig the contig index string + * @param start the start position + * @param stop the stop position + */ + public void queryContained( String contig, int start, int stop ) { + this.overlapping = false; + initialize(contig, start, stop); + } + + /** + * query containing - get reads contained by the specified interval + * + * @param contig the contig index string + * @param start the start position + * @param stop the stop position + */ + public void queryOverlapping( String contig, int start, int stop ) { + this.overlapping = true; + initialize(contig, start, stop); + } + + + /** + * initialize the query iterator + * + * @param contig the contig + * @param start the start position + * @param stop the stop postition + */ + private void initialize( String contig, int start, int stop ) { + ensureUntouched(); + finalPos = stop; + startPos = start; + if (finalPos < 0) { + finalPos = Integer.MAX_VALUE; + } + // sanity check that we have the contig + contigIndex = -1; + List list = header.getSequenceDictionary().getSequences(); + for (SAMSequenceRecord rec : list) { + if (rec.getSequenceName().equals(contig)) { + contigIndex = rec.getSequenceIndex(); + } + } + if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); } + while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) { + super.next(); + } + if (!super.hasNext()) { + throw new StingException("Unable to find the target chromosome"); + } + while (super.hasNext() && this.peek().getAlignmentStart() < start) { + super.next(); + } + // sanity check that we have an actual matching read next + SAMRecord rec = this.peek(); + if (!matches(rec)) { + throw new StingException("The next read doesn't match"); + } + // set the seeked variable to true + seeked = true; + } + + /** + * given a read and the query type, check if it matches our regions + * + * @param rec the read + * + * @return true if it belongs in our region + */ + public boolean matches( SAMRecord rec ) { + if (rec.getReferenceIndex() != this.contigIndex) { + return false; + } + if (!overlapping) { + // if the start or the end are somewhere within our range + if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) { + return true; + } + } else { + if (( rec.getAlignmentStart() <= finalPos && rec.getAlignmentStart() >= startPos ) || + ( rec.getAlignmentEnd() <= finalPos && rec.getAlignmentEnd() >= startPos )) { + return true; + } + } + return false; + } + + + /** + * override the hasNext, to incorportate our limiting factor + * + * @return + */ + public boolean hasNext() { + boolean res = super.hasNext(); + if (!seeked) { + return res; + } + if (res && matches(this.next)) { + return true; + } + return false; + } + + /** make sure we haven't been used as an iterator yet; this is to miror the MergingSamIterator2 action. */ + public void ensureUntouched() { + if (this.currentChromo != 0 || this.currentRead > 0) { + throw new UnsupportedOperationException("We've already been used as an iterator; you can't query after that"); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSamUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSamUtils.java index 30c6897e1..953862ab9 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSamUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSamUtils.java @@ -5,7 +5,7 @@ import net.sf.samtools.*; import java.io.File; import java.util.*; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; import org.broadinstitute.sting.gatk.Reads; /** @@ -28,11 +28,6 @@ import org.broadinstitute.sting.gatk.Reads; /** * @author aaron * @version 1.0 - * @date May 21, 2009 - *

- * Class samUtils - *

- * A collection of utilities for making sam and bam files. Mostly these are for creating sam and bam files for testing purposes. */ public class ArtificialSamUtils { public static final int DEFAULT_READ_LENGTH = 50; @@ -46,7 +41,7 @@ public class ArtificialSamUtils { * @param chromosomeSize how large each chromosome is * @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -70,7 +65,7 @@ public class ArtificialSamUtils { * @param chromosomeSize how large each chromosome is * @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads) */ - public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) { SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); File outFile = new File(filename); @@ -91,14 +86,15 @@ public class ArtificialSamUtils { * @param numberOfChromosomes the number of chromosomes to create * @param startingChromosome the starting number for the chromosome (most likely set to 1) * @param chromosomeSize the length of each chromosome + * * @return */ - public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { + public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) { SAMFileHeader header = new SAMFileHeader(); SAMSequenceDictionary dict = new SAMSequenceDictionary(); // make up some sequence records for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); + SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */); rec.setSequenceLength(chromosomeSize); dict.addSequence(rec); } @@ -114,9 +110,10 @@ public class ArtificialSamUtils { * @param refIndex the reference index, i.e. what chromosome to associate it with * @param alignmentStart where to start the alignment * @param length the length of the read + * * @return the artificial read */ - public static SAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { + public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { SAMRecord record = new SAMRecord(header); record.setReadName(name); record.setReferenceIndex(refIndex); @@ -131,95 +128,64 @@ public class ArtificialSamUtils { /** * create an iterator containing the specified read piles + * * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * * @return StingSAMIterator representing the specified amount of fake data */ - public static StingSAMIterator unmappedReadIterator(int startingChr, int endingChr, int readCount ) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - Map map = new HashMap(); - for (int x = startingChr; x < endingChr; x++) { - map.put(x,readCount); - } - return new ArtificialSAMIterator(startingChr, endingChr, readCount,header); + public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) { + SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMIterator(startingChr, endingChr, readCount, header); } - -} - -/** - * this fake iterator allows us to look at how specific piles of reads are handled - */ -class ArtificialSAMIterator implements StingSAMIterator { - - - private int currentChromo = 0; - private int currentRead = 0; - private int readCount = 0; - private boolean done = false; - // the next record - private SAMRecord next = null; - SAMFileHeader header = null; - - // the passed in parameters - final int sChr; - final int eChr; - final int rCount; - /** - * create the fake iterator, given the mapping of chromosomes and read counts - * @param startingChr the starting chromosome - * @param endingChr the ending chromosome - * @param readCount the number of reads in each chromosome - * @param header the associated header + * create an iterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * + * @return StingSAMIterator representing the specified amount of fake data */ - ArtificialSAMIterator(int startingChr, int endingChr, int readCount, SAMFileHeader header) { - sChr = startingChr; - eChr = endingChr; - rCount = readCount; - this.header = header; - this.currentChromo = startingChr; + public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { + SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } - public Reads getSourceInfo() { - throw new UnsupportedOperationException("We don't support this"); + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * + * @return StingSAMIterator representing the specified amount of fake data + */ + public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) { + SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); } - public void close() { - // done - currentChromo = Integer.MAX_VALUE; - } + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * + * @return StingSAMIterator representing the specified amount of fake data + */ + public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { + SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - public boolean hasNext() { - if (currentRead >= rCount) { - currentChromo++; - currentRead = 0; - } - if (currentChromo >= eChr) { - return false; - } - int alignment = 0; - if (currentChromo >= 0) { - alignment = currentRead; - } else { - alignment = 0; - } - - this.next = ArtificialSamUtils.createArtificialRead(this.header,String.valueOf(readCount), currentChromo, alignment, 50); - currentRead++; - return true; - } - - public SAMRecord next() { - return next; - } - - public void remove() { - throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator."); - } - - public Iterator iterator() { - return this; + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } } + diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java index 4330ea46a..2b6a49ad4 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java @@ -43,6 +43,12 @@ import net.sf.samtools.SAMFileHeader; * Class ReadIntervalShardStrategyTest *

* Tests the ReadIntervalShardStrategy class + * + * + * TODO: this test has been changed, since we now force interval shards to not subdivide if they're large, + * so you should always get one shard back, reguardless of the specified length. If this behavior changes + * back in the future, the expected number of shards should change from 1 to > 1. + * */ public class IntervalShardStrategyTest extends BaseTest { @@ -75,7 +81,7 @@ public class IntervalShardStrategyTest extends BaseTest { counter++; } assertTrue(d instanceof IntervalShard); - assertEquals(10, counter); + assertEquals(1, counter); } @Test @@ -92,7 +98,7 @@ public class IntervalShardStrategyTest extends BaseTest { counter++; } assertTrue(d instanceof IntervalShard); - assertEquals(50, counter); + assertEquals(5, counter); } @Test @@ -105,16 +111,11 @@ public class IntervalShardStrategyTest extends BaseTest { int counter = 0; while (strat.hasNext()) { Shard d = strat.next(); - if (counter % 2 == 0) { assertEquals(1, d.getGenomeLoc().getStart()); - assertEquals(789, d.getGenomeLoc().getStop()); - } else { - assertEquals(790, d.getGenomeLoc().getStart()); assertEquals(1000, d.getGenomeLoc().getStop()); - } counter++; } - assertEquals(10, counter); + assertEquals(5, counter); } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java index 8b192feda..aff387c0b 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java @@ -89,7 +89,7 @@ public class SAMBAMDataSourceTest extends BaseTest { Reads reads = new Reads(fl); try { - SAMDataSource data = new SAMDataSource(reads); + SAMDataSource data = new SAMDataSource(reads,false); for (Shard sh : strat) { int readCount = 0; count++; @@ -139,7 +139,7 @@ public class SAMBAMDataSourceTest extends BaseTest { int count = 0; try { - SAMDataSource data = new SAMDataSource(reads); + SAMDataSource data = new SAMDataSource(reads,false); for (Shard sh : strat) { int readCount = 0; count++; @@ -176,7 +176,7 @@ public class SAMBAMDataSourceTest extends BaseTest { logger.debug("Pile two:"); try { - SAMDataSource data = new SAMDataSource(reads); + SAMDataSource data = new SAMDataSource(reads,false); for (Shard sh : strat) { int readCount = 0; count++; diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java index 5edcd0394..2c6e456e3 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java @@ -2,15 +2,20 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; import static junit.framework.Assert.fail; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; +import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.junit.Before; import org.junit.Test; @@ -38,16 +43,14 @@ import java.util.List; /** * @author aaron * @version 1.0 - * @date Apr 15, 2009 - *

- * Class SAMByReadsTest - *

- * Test sam traversal by reads FIX ME */ public class SAMByReadsTest extends BaseTest { - private FastaSequenceFile2 seq; + private List fl; + ShardStrategy shardStrategy; + Reads reads; + private int targetReadCount = 14; /** * This function does the setup of our parser, before each method call. @@ -59,56 +62,42 @@ public class SAMByReadsTest extends BaseTest { fl = new ArrayList(); // sequence - seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta")); - GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); + //seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta")); + //GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); + + // setup the test files + fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam")); + reads = new Reads(fl); + + } /** Test out that we can shard the file and iterate over every read */ @Test - public void testTotalReadCount() { - logger.warn("Executing testTotalReadCount"); - - // setup the test files - fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam")); - Reads reads = new Reads(fl); - - final int targetReadCount = 5000; - - ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount); - + public void testToUnmappedReads() { + ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); try { - SAMDataSource data = new SAMDataSource(reads); - - // check the total read count - final int totalReads = 10000; - - int readsSeen = 0; - BoundedReadIterator iter; - - - for (Shard sd : shardStrategy) { - int readcnt = 0; - - iter = (BoundedReadIterator)data.seek(sd); - - for (SAMRecord r : iter) { - - readcnt++; - + int unmappedReadsSeen = 0; + int iterations = 0; + SAMDataSource data = new SAMDataSource(reads,true); + for (int x = 0; x < 10; x++) { + ++iterations; + PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10, 1000); + BoundedReadIterator ret = data.toUnmappedReads(100, iter); + // count the reads we've gotten back + if (ret == null) { + fail("On iteration " + iterations + " we were returned a null pointer, after seeing " + unmappedReadsSeen + " reads out of a 1000"); + } + while (ret.hasNext()) { + ret.next(); + unmappedReadsSeen++; } - - - readsSeen += readcnt; - //logger.warn("Seen " + readsSeen + " reads."); - } - // make sure we've seen all the reads - assertEquals(totalReads,readsSeen); - logger.warn("Success " + readsSeen + " equals target count of " + totalReads); - + assertEquals(1000,unmappedReadsSeen); } - + catch (SimpleDataSourceLoadException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException"); @@ -116,4 +105,145 @@ public class SAMByReadsTest extends BaseTest { } + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testShardingOfReadsSize14() { + ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); + targetReadCount = 14; + try { + int iterations = 0; + int readCount = 0; + SAMDataSource data = new SAMDataSource(reads,true); + + + data.setIterGen(gen); + shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); + while (shardStrategy.hasNext()) { + + + BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next()); + assertTrue(ret != null); + while (ret.hasNext()) { + ret.next(); + readCount++; + } + ret.close(); + iterations++; + } + + // assert that we saw 2000 reads + assertEquals(2000,readCount); + + /** + * this next assertion is based on the following logic: + * 14 reads per shard = 8 shards per each 100 read chromosome + * 10 chromosomes = 8 * 10 = 80 + * 1000 unmapped reads / 14 = 72 + * 1 iteration at the end to know we're done + * 80 + 72 + 1 = 153 + */ + assertEquals(153,iterations); + + } + + catch (SimpleDataSourceLoadException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException"); + } + + + } + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testShardingOfReadsSize25() { + ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000); + GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary()); + targetReadCount = 25; + try { + int iterations = 0; + int readCount = 0; + SAMDataSource data = new SAMDataSource(reads,true); + + + data.setIterGen(gen); + shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount); + while (shardStrategy.hasNext()) { + + + BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next()); + assertTrue(ret != null); + while (ret.hasNext()) { + ret.next(); + readCount++; + } + ret.close(); + iterations++; + } + + // assert that we saw 2000 reads + assertEquals(2000,readCount); + + /** + * this next assertion is based on the following logic: + * 25 reads per shard = 5 shards (1 on the end to realize we're done) + * 10 chromosomes = 5 * 10 = 50 + * 1000 unmapped reads / 25 = 40 + * 1 iteration at the end to know we're done + * 50 + 40 + 1 = 91 + */ + assertEquals(91,iterations); + + } + + catch (SimpleDataSourceLoadException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException"); + } + + + } + + } + +/** + * use this to inject into SAMDataSource for testing + */ +class ArtificialIteratorGenerator extends IteratorGenerator { + // How strict should we be with SAM/BAM parsing? + protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT; + + // the header + private SAMFileHeader header; + private int endingChr; + private int startingChr; + private int readCount; + private int readSize; + /** our SAM data files */ + private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; + + public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) { + header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize); + + } + + public CloseableIterator seek( GenomeLoc seekTo ) { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000); + if (seekTo != null) { + iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop()); + } + return iter; + } + + /** + * get the merged header + * + * @return the merged header + */ + public SAMFileHeader getHeader() { + return this.header; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java index 40db61131..3ecc8df76 100755 --- a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java +++ b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java @@ -85,7 +85,7 @@ public class BoundedReadIteratorTest extends BaseTest { long shardReadCount = 0; try { - SAMDataSource data = new SAMDataSource(reads); + SAMDataSource data = new SAMDataSource(reads,true); // make sure we have a shard if (!strat.hasNext()) { diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java index d1023c8cd..4e9707e98 100755 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsTest.java @@ -122,7 +122,7 @@ public class TraverseReadsTest extends BaseTest { ref.getSequenceDictionary(), readSize); - SAMDataSource dataSource = new SAMDataSource(new Reads(bamList)); + SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true); dataSource.viewUnmappedReads(false); countReadWalker.initialize(); @@ -170,7 +170,7 @@ public class TraverseReadsTest extends BaseTest { ref.getSequenceDictionary(), readSize); - SAMDataSource dataSource = new SAMDataSource(new Reads(bamList)); + SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true); dataSource.viewUnmappedReads(true); countReadWalker.initialize(); diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorTest.java b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorTest.java new file mode 100644 index 000000000..8ad9c15ba --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIteratorTest.java @@ -0,0 +1,115 @@ +package org.broadinstitute.sting.utils.sam; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import net.sf.samtools.SAMRecord; +import static junit.framework.Assert.assertTrue; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class ArtificialSAMQueryIteratorTest + *

+ * a test for the ArtificialSAMQueryIterator class. + */ +public class ArtificialSAMQueryIteratorTest extends BaseTest { + + @Test + public void testWholeChromosomeQuery() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryContained("chr1", 1, -1); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(100, count); + + } + + @Test + public void testContainedQueryStart() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryContained("chr1", 1, 50); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(1, count); + + } + + @Test + public void testOverlappingQueryStart() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryOverlapping("chr1", 1, 50); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(50, count); + + } + + @Test + public void testContainedQueryMiddle() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryContained("chr1", 25, 74); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(1, count); + + } + + @Test + public void testOverlappingQueryMiddle() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryOverlapping("chr1", 25, 74); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(50, count); + + } + + @Test(expected = IllegalArgumentException.class) + public void testUnknownChromosome() { + ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100); + iter.queryOverlapping("chr621", 25, 74); + } +} diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSamUtilsTest.java b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSamUtilsTest.java new file mode 100644 index 000000000..558c47b74 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSamUtilsTest.java @@ -0,0 +1,109 @@ +package org.broadinstitute.sting.utils.sam; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; +import static junit.framework.Assert.assertTrue; +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Jun 3, 2009 + * Time: 3:09:34 AM + * To change this template use File | Settings | File Templates. + */ +public class ArtificialSamUtilsTest extends BaseTest { + + + @Test + public void basicReadIteratorTest() { + StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(100 * 100, count); + } + + @Test + public void tenPerChromosome() { + StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex()); + count++; + } + assertEquals(100 * 10, count); + } + + @Test + public void onePerChromosome() { + StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 1); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + assertEquals(Integer.valueOf(count), rec.getReferenceIndex()); + count++; + } + assertEquals(100 * 1, count); + } + + @Test + public void basicUnmappedIteratorTest() { + StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100, 1000); + int count = 0; + for (int x = 0; x < (100* 100); x++ ) { + if (!iter.hasNext()) { + fail ("we didn't get the expected number of reads"); + } + SAMRecord rec = iter.next(); + assertTrue(rec.getReferenceIndex() >= 0); + count++; + } + assertEquals(count, 100 * 100); + + // now we should have 1000 unmapped reads + count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + assertTrue(rec.getReferenceIndex() < 0); + count++; + } + assertEquals(1000, count); + } + + @Test + public void testPeeking() { + PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100); + int count = 0; + while (iter.hasNext()) { + int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken(); + + // peek the record + SAMRecord rec = iter.peek(); + assertTrue(rec.getReferenceIndex() >= 0); + + // next the record + SAMRecord rec2 = iter.next(); + assertTrue(rec2.getReadName() == rec.getReadName()); + assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart()); + + // find out how many reads we've taken now + int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken(); + + count++; + if (count < 100*100) assertEquals(readCnt + 1, readCnt2); + else assertEquals(readCnt, readCnt2); + } + assertEquals(100 * 100, count ); + + } +}