A lot of changes to support by-read sharding and some from debugging of the by loci traversals

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@511 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-23 19:03:14 +00:00
parent 32715a6c47
commit 8c13940c5a
5 changed files with 276 additions and 42 deletions

View File

@ -33,14 +33,27 @@ public class ReadShard implements Shard {
// the count of the reads we want to copy off
int size = 0;
// this is going to get gross
private final ReadShardStrategy str;
/**
* create a read shard, given a read size
* @param size
*/
public ReadShard(int size) {
this.str = null;
this.size = size;
}
/**
* create a read shard, given a read size
* @param size
*/
ReadShard(ReadShardStrategy caller, int size) {
this.str = caller;
this.size = size;
}
/** @return the genome location represented by this shard */
public GenomeLoc getGenomeLoc() {
return null; //To change body of implemented methods use File | Settings | File Templates.

View File

@ -3,7 +3,9 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -40,6 +42,20 @@ public class SAMDataSource implements SimpleDataSource {
// our list of readers
private final List<File> samFileList = new ArrayList<File>();
// used for the reads case, the last count of reads retrieved
private long readsTaken = 0;
// our last genome loc position
private GenomeLoc lastReadPos = null;
// do we take unmapped reads
private boolean includeUnmappedReads = true;
// reads based traversal variables
private boolean intoUnmappedReads = false;
private int readsAtLastPos = 0;
/**
* constructor, given sam files
*
@ -48,18 +64,16 @@ public class SAMDataSource implements SimpleDataSource {
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
// check the length
if (samFiles.size() < 1) {
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
}
for (Object fileName : samFiles) {
File smFile;
if ( samFiles.get(0) instanceof String) {
smFile = new File((String)samFiles.get(0));
}
else if (samFiles.get(0) instanceof File) {
smFile = (File)fileName;
}
else {
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
if (samFiles.get(0) instanceof String) {
smFile = new File((String) samFiles.get(0));
} else if (samFiles.get(0) instanceof File) {
smFile = (File) fileName;
} else {
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
}
if (!smFile.canRead()) {
@ -71,11 +85,13 @@ public class SAMDataSource implements SimpleDataSource {
}
protected SAMFileReader initializeSAMFile(final File samFile) {
/**
* Load up a sam file.
*
* @param samFile the file name
* @return a SAMFileReader for the file
*/
private SAMFileReader initializeSAMFile(final File samFile) {
if (samFile.toString().endsWith(".list")) {
return null;
} else {
@ -99,6 +115,197 @@ public class SAMDataSource implements SimpleDataSource {
*/
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst = GetReaderList();
// now merge the headers
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
// we do different things for locus and read modes
if (locusMode) {
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
} else {
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
}
// return the iterator
return iter;
}
/**
* If we're in by-read mode, this indicates if we want
* to see unmapped reads too. Only seeing mapped reads
* is much faster, but most BAM files have significant
* unmapped read counts.
*
* @param seeUnMappedReads true to see unmapped reads, false otherwise
*/
public void viewUnmappedReads(boolean seeUnMappedReads) {
includeUnmappedReads = seeUnMappedReads;
}
/**
* <p>
* seek
* </p>
*
* @param readCount the length or reads to extract
* @return an iterator for that region
*/
public BoundedReadIterator seek(long readCount) throws SimpleDataSourceLoadException {
// TODO: make extremely less horrible
List<SAMFileReader> lst = GetReaderList();
BoundedReadIterator bound;
// now merge the headers
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
if (!includeUnmappedReads) {
return fastMappedReadSeek(readCount, iter);
} else {
return unmappedReadSeek(readCount, iter);
}
}
/**
* Seek, if we want unmapped reads. This method will be faster then the unmapped read method, but you cannot extract the
* unmapped reads.
*
* @param readCount how many reads to retrieve
* @param iter the iterator to use
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
private BoundedReadIterator unmappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
BoundedReadIterator bound;// is this the first time we're doing this?
int count = 0;
// throw away as many reads as it takes
while (count < this.readsTaken && iter.hasNext()) {
iter.next();
++count;
}
// check to see what happened, did we run out of reads?
if (count != this.readsTaken) {
return null;
}
// we're good, increment our read cout
this.readsTaken += readCount;
return new BoundedReadIterator(iter,readCount);
}
/**
* Seek, if we only want mapped reads. This method will be faster then the unmapped read method, but you cannot extract the
* unmapped reads.
*
* @param readCount how many reads to retrieve
* @param iter the iterator to use
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
BoundedReadIterator bound;// is this the first time we're doing this?
if (lastReadPos == null) {
lastReadPos = new GenomeLoc(iter.getMergedHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
bound = new BoundedReadIterator(iter, readCount);
this.readsTaken = readCount;
}
// we're not at the beginning, not at the end, so we move forward with our ghastly plan...
else {
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
// move the number of reads we read from the last pos
while (iter.hasNext() && this.readsAtLastPos > 0) {
iter.next();
--readsAtLastPos;
}
if (readsAtLastPos != 0) {
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
}
int x = 0;
SAMRecord rec = null;
int lastPos = 0;
while (x < readsTaken) {
if (iter.hasNext()) {
rec = iter.next();
if (lastPos == rec.getAlignmentStart()) {
this.readsAtLastPos++;
} else {
this.readsAtLastPos = 1;
}
lastPos = rec.getAlignmentStart();
//System.out.print("t" + rec.getAlignmentStart() + " ");
++x;
} else {
// jump contigs
if (lastReadPos.toNextContig() == false) {
return null;
}
iter.close();
// now merge the headers
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst2 = GetReaderList();
SamFileHeaderMerger mg = new SamFileHeaderMerger(lst2, SORT_ORDER);
iter = new MergingSamRecordIterator2(mg);
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
}
}
// if we're off the end of the last contig (into unmapped territory)
if (rec != null && rec.getAlignmentStart() == 0) {
readsTaken += readCount;
intoUnmappedReads = true;
}
// else we're not off the end, store our location
else if (rec != null) {
int stopPos = rec.getAlignmentStart();
if (stopPos < lastReadPos.getStart()) {
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
} else {
lastReadPos.setStop(rec.getAlignmentStart());
}
}
// in case we're run out of reads, get out
else {
return null;
}
bound = new BoundedReadIterator(iter, readCount);
}
// return the iterator
return bound;
}
/**
* A private function that, given the internal file list, generates a SamFileReader
* list of validated files.
*
* @return a list of SAMFileReaders that represent the stored file names
* @throws SimpleDataSourceLoadException if there's a problem loading the files
*/
private List<SAMFileReader> GetReaderList() throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
for (File f : this.samFileList) {
@ -108,25 +315,7 @@ public class SAMDataSource implements SimpleDataSource {
}
lst.add(reader);
}
// now merge the headers
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
// we do different things for locus and read modes
if (locusMode) {
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop()+1);
} else {
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()+1);
}
// return the iterator
return iter;
return lst;
}
}

View File

@ -37,11 +37,19 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
// the genome loc we're bounding
final private long readCount;
private long currentCount = 0;
// the iterator we want to decorate
private CloseableIterator<SAMRecord> iterator;
private final CloseableIterator<SAMRecord> iterator;
// our unmapped read flag
private boolean doNotUseThatUnmappedReadPile = false;
// are we open
private boolean isOpen = false;
// the next read we've buffered
private SAMRecord record = null;
/**
* constructor
* @param iter
@ -50,11 +58,16 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
public BoundedReadIterator(CloseableIterator<SAMRecord> iter, long readCount) {
if (iter != null) {
isOpen = true;
this.iterator = iter;
}
this.iterator = iter;
this.readCount = readCount;
}
public void useUnmappedReads(boolean useThem) {
this.doNotUseThatUnmappedReadPile = useThem;
}
/**
* Do we have a next? If the iterator has a read and we're not over the read
@ -63,6 +76,11 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
*/
public boolean hasNext() {
if (isOpen && iterator.hasNext() && currentCount < readCount) {
record = iterator.next();
++currentCount;
if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) {
return false;
}
return true;
} else {
if (isOpen) {
@ -77,11 +95,7 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
* @return SAMRecord representing the next read
*/
public SAMRecord next() {
if (!isOpen) {
throw new UnsupportedOperationException("You cannot call next on a closed iterator");
}
++currentCount;
return iterator.next();
return record;
}
/**

View File

@ -110,7 +110,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
/** Returns true if any of the underlying iterators has more records, otherwise false. */
public synchronized boolean hasNext() {
public boolean hasNext() {
if (!initialized) {
lazyInitialization();
}
@ -118,7 +118,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
/** Returns the next record from the top most iterator during merging. */
public synchronized SAMRecord next() {
public SAMRecord next() {
if (!initialized) {
lazyInitialization();
}

View File

@ -284,6 +284,24 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
}
}
/**
* Move this Genome loc to the next contig, with a start
* and stop of 1.
* @return true if we are not out of contigs, otherwise false if we're
* at the end of the genome (no more contigs to jump to).
*/
public boolean toNextContig() {
if (contigIndex < GenomeLoc.contigInfo.size()) {
this.contigIndex++;
this.start = 1;
this.stop = 1;
return true;
}
return false;
}
/**
* Returns true iff we have a specified series of locations to process AND we are past the last
* location in the list. It means that, in a serial processing of the genome, that we are done.