A lot of changes to support by-read sharding and some from debugging of the by loci traversals
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@511 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
32715a6c47
commit
8c13940c5a
|
|
@ -33,14 +33,27 @@ public class ReadShard implements Shard {
|
||||||
// the count of the reads we want to copy off
|
// the count of the reads we want to copy off
|
||||||
int size = 0;
|
int size = 0;
|
||||||
|
|
||||||
|
// this is going to get gross
|
||||||
|
private final ReadShardStrategy str;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a read shard, given a read size
|
* create a read shard, given a read size
|
||||||
* @param size
|
* @param size
|
||||||
*/
|
*/
|
||||||
public ReadShard(int size) {
|
public ReadShard(int size) {
|
||||||
|
this.str = null;
|
||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a read shard, given a read size
|
||||||
|
* @param size
|
||||||
|
*/
|
||||||
|
ReadShard(ReadShardStrategy caller, int size) {
|
||||||
|
this.str = caller;
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
/** @return the genome location represented by this shard */
|
/** @return the genome location represented by this shard */
|
||||||
public GenomeLoc getGenomeLoc() {
|
public GenomeLoc getGenomeLoc() {
|
||||||
return null; //To change body of implemented methods use File | Settings | File Templates.
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,9 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileReader;
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
|
@ -40,6 +42,20 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
// our list of readers
|
// our list of readers
|
||||||
private final List<File> samFileList = new ArrayList<File>();
|
private final List<File> samFileList = new ArrayList<File>();
|
||||||
|
|
||||||
|
// used for the reads case, the last count of reads retrieved
|
||||||
|
private long readsTaken = 0;
|
||||||
|
|
||||||
|
// our last genome loc position
|
||||||
|
private GenomeLoc lastReadPos = null;
|
||||||
|
|
||||||
|
// do we take unmapped reads
|
||||||
|
private boolean includeUnmappedReads = true;
|
||||||
|
|
||||||
|
// reads based traversal variables
|
||||||
|
private boolean intoUnmappedReads = false;
|
||||||
|
private int readsAtLastPos = 0;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* constructor, given sam files
|
* constructor, given sam files
|
||||||
*
|
*
|
||||||
|
|
@ -48,18 +64,16 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
|
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
|
||||||
// check the length
|
// check the length
|
||||||
if (samFiles.size() < 1) {
|
if (samFiles.size() < 1) {
|
||||||
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
|
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
|
||||||
}
|
}
|
||||||
for (Object fileName : samFiles) {
|
for (Object fileName : samFiles) {
|
||||||
File smFile;
|
File smFile;
|
||||||
if ( samFiles.get(0) instanceof String) {
|
if (samFiles.get(0) instanceof String) {
|
||||||
smFile = new File((String)samFiles.get(0));
|
smFile = new File((String) samFiles.get(0));
|
||||||
}
|
} else if (samFiles.get(0) instanceof File) {
|
||||||
else if (samFiles.get(0) instanceof File) {
|
smFile = (File) fileName;
|
||||||
smFile = (File)fileName;
|
} else {
|
||||||
}
|
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
|
||||||
else {
|
|
||||||
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!smFile.canRead()) {
|
if (!smFile.canRead()) {
|
||||||
|
|
@ -71,11 +85,13 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load up a sam file.
|
||||||
|
*
|
||||||
|
* @param samFile the file name
|
||||||
protected SAMFileReader initializeSAMFile(final File samFile) {
|
* @return a SAMFileReader for the file
|
||||||
|
*/
|
||||||
|
private SAMFileReader initializeSAMFile(final File samFile) {
|
||||||
if (samFile.toString().endsWith(".list")) {
|
if (samFile.toString().endsWith(".list")) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -99,6 +115,197 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
*/
|
*/
|
||||||
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
|
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||||
|
|
||||||
|
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||||
|
List<SAMFileReader> lst = GetReaderList();
|
||||||
|
|
||||||
|
// now merge the headers
|
||||||
|
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
||||||
|
|
||||||
|
// make a merging iterator for this record
|
||||||
|
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||||
|
|
||||||
|
|
||||||
|
// we do different things for locus and read modes
|
||||||
|
if (locusMode) {
|
||||||
|
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
||||||
|
} else {
|
||||||
|
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// return the iterator
|
||||||
|
return iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If we're in by-read mode, this indicates if we want
|
||||||
|
* to see unmapped reads too. Only seeing mapped reads
|
||||||
|
* is much faster, but most BAM files have significant
|
||||||
|
* unmapped read counts.
|
||||||
|
*
|
||||||
|
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
||||||
|
*/
|
||||||
|
public void viewUnmappedReads(boolean seeUnMappedReads) {
|
||||||
|
includeUnmappedReads = seeUnMappedReads;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* seek
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param readCount the length or reads to extract
|
||||||
|
* @return an iterator for that region
|
||||||
|
*/
|
||||||
|
public BoundedReadIterator seek(long readCount) throws SimpleDataSourceLoadException {
|
||||||
|
// TODO: make extremely less horrible
|
||||||
|
List<SAMFileReader> lst = GetReaderList();
|
||||||
|
|
||||||
|
BoundedReadIterator bound;
|
||||||
|
|
||||||
|
// now merge the headers
|
||||||
|
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
||||||
|
|
||||||
|
// make a merging iterator for this record
|
||||||
|
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||||
|
if (!includeUnmappedReads) {
|
||||||
|
return fastMappedReadSeek(readCount, iter);
|
||||||
|
} else {
|
||||||
|
return unmappedReadSeek(readCount, iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Seek, if we want unmapped reads. This method will be faster then the unmapped read method, but you cannot extract the
|
||||||
|
* unmapped reads.
|
||||||
|
*
|
||||||
|
* @param readCount how many reads to retrieve
|
||||||
|
* @param iter the iterator to use
|
||||||
|
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||||
|
* @throws SimpleDataSourceLoadException
|
||||||
|
*/
|
||||||
|
private BoundedReadIterator unmappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||||
|
BoundedReadIterator bound;// is this the first time we're doing this?
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
// throw away as many reads as it takes
|
||||||
|
while (count < this.readsTaken && iter.hasNext()) {
|
||||||
|
iter.next();
|
||||||
|
++count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check to see what happened, did we run out of reads?
|
||||||
|
if (count != this.readsTaken) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we're good, increment our read cout
|
||||||
|
this.readsTaken += readCount;
|
||||||
|
return new BoundedReadIterator(iter,readCount);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Seek, if we only want mapped reads. This method will be faster then the unmapped read method, but you cannot extract the
|
||||||
|
* unmapped reads.
|
||||||
|
*
|
||||||
|
* @param readCount how many reads to retrieve
|
||||||
|
* @param iter the iterator to use
|
||||||
|
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||||
|
* @throws SimpleDataSourceLoadException
|
||||||
|
*/
|
||||||
|
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||||
|
BoundedReadIterator bound;// is this the first time we're doing this?
|
||||||
|
if (lastReadPos == null) {
|
||||||
|
lastReadPos = new GenomeLoc(iter.getMergedHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
|
||||||
|
bound = new BoundedReadIterator(iter, readCount);
|
||||||
|
this.readsTaken = readCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we're not at the beginning, not at the end, so we move forward with our ghastly plan...
|
||||||
|
else {
|
||||||
|
|
||||||
|
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
||||||
|
|
||||||
|
// move the number of reads we read from the last pos
|
||||||
|
while (iter.hasNext() && this.readsAtLastPos > 0) {
|
||||||
|
iter.next();
|
||||||
|
--readsAtLastPos;
|
||||||
|
}
|
||||||
|
if (readsAtLastPos != 0) {
|
||||||
|
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
int x = 0;
|
||||||
|
SAMRecord rec = null;
|
||||||
|
int lastPos = 0;
|
||||||
|
|
||||||
|
while (x < readsTaken) {
|
||||||
|
if (iter.hasNext()) {
|
||||||
|
rec = iter.next();
|
||||||
|
if (lastPos == rec.getAlignmentStart()) {
|
||||||
|
this.readsAtLastPos++;
|
||||||
|
} else {
|
||||||
|
this.readsAtLastPos = 1;
|
||||||
|
}
|
||||||
|
lastPos = rec.getAlignmentStart();
|
||||||
|
//System.out.print("t" + rec.getAlignmentStart() + " ");
|
||||||
|
++x;
|
||||||
|
} else {
|
||||||
|
// jump contigs
|
||||||
|
if (lastReadPos.toNextContig() == false) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
iter.close();
|
||||||
|
// now merge the headers
|
||||||
|
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||||
|
List<SAMFileReader> lst2 = GetReaderList();
|
||||||
|
SamFileHeaderMerger mg = new SamFileHeaderMerger(lst2, SORT_ORDER);
|
||||||
|
iter = new MergingSamRecordIterator2(mg);
|
||||||
|
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// if we're off the end of the last contig (into unmapped territory)
|
||||||
|
if (rec != null && rec.getAlignmentStart() == 0) {
|
||||||
|
readsTaken += readCount;
|
||||||
|
intoUnmappedReads = true;
|
||||||
|
}
|
||||||
|
// else we're not off the end, store our location
|
||||||
|
else if (rec != null) {
|
||||||
|
int stopPos = rec.getAlignmentStart();
|
||||||
|
if (stopPos < lastReadPos.getStart()) {
|
||||||
|
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
||||||
|
} else {
|
||||||
|
lastReadPos.setStop(rec.getAlignmentStart());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// in case we're run out of reads, get out
|
||||||
|
else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
bound = new BoundedReadIterator(iter, readCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// return the iterator
|
||||||
|
return bound;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A private function that, given the internal file list, generates a SamFileReader
|
||||||
|
* list of validated files.
|
||||||
|
*
|
||||||
|
* @return a list of SAMFileReaders that represent the stored file names
|
||||||
|
* @throws SimpleDataSourceLoadException if there's a problem loading the files
|
||||||
|
*/
|
||||||
|
private List<SAMFileReader> GetReaderList() throws SimpleDataSourceLoadException {
|
||||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||||
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
||||||
for (File f : this.samFileList) {
|
for (File f : this.samFileList) {
|
||||||
|
|
@ -108,25 +315,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
}
|
}
|
||||||
lst.add(reader);
|
lst.add(reader);
|
||||||
}
|
}
|
||||||
|
return lst;
|
||||||
// now merge the headers
|
|
||||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
|
||||||
|
|
||||||
// make a merging iterator for this record
|
|
||||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
|
||||||
|
|
||||||
|
|
||||||
// we do different things for locus and read modes
|
|
||||||
if (locusMode) {
|
|
||||||
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop()+1);
|
|
||||||
} else {
|
|
||||||
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()+1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// return the iterator
|
|
||||||
return iter;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,11 +37,19 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
|
||||||
// the genome loc we're bounding
|
// the genome loc we're bounding
|
||||||
final private long readCount;
|
final private long readCount;
|
||||||
private long currentCount = 0;
|
private long currentCount = 0;
|
||||||
|
|
||||||
// the iterator we want to decorate
|
// the iterator we want to decorate
|
||||||
private CloseableIterator<SAMRecord> iterator;
|
private final CloseableIterator<SAMRecord> iterator;
|
||||||
|
|
||||||
|
// our unmapped read flag
|
||||||
|
private boolean doNotUseThatUnmappedReadPile = false;
|
||||||
|
|
||||||
// are we open
|
// are we open
|
||||||
private boolean isOpen = false;
|
private boolean isOpen = false;
|
||||||
|
|
||||||
|
// the next read we've buffered
|
||||||
|
private SAMRecord record = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* constructor
|
* constructor
|
||||||
* @param iter
|
* @param iter
|
||||||
|
|
@ -50,11 +58,16 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
|
||||||
public BoundedReadIterator(CloseableIterator<SAMRecord> iter, long readCount) {
|
public BoundedReadIterator(CloseableIterator<SAMRecord> iter, long readCount) {
|
||||||
if (iter != null) {
|
if (iter != null) {
|
||||||
isOpen = true;
|
isOpen = true;
|
||||||
this.iterator = iter;
|
|
||||||
}
|
}
|
||||||
|
this.iterator = iter;
|
||||||
this.readCount = readCount;
|
this.readCount = readCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void useUnmappedReads(boolean useThem) {
|
||||||
|
this.doNotUseThatUnmappedReadPile = useThem;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do we have a next? If the iterator has a read and we're not over the read
|
* Do we have a next? If the iterator has a read and we're not over the read
|
||||||
|
|
@ -63,6 +76,11 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
if (isOpen && iterator.hasNext() && currentCount < readCount) {
|
if (isOpen && iterator.hasNext() && currentCount < readCount) {
|
||||||
|
record = iterator.next();
|
||||||
|
++currentCount;
|
||||||
|
if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
if (isOpen) {
|
if (isOpen) {
|
||||||
|
|
@ -77,11 +95,7 @@ public class BoundedReadIterator implements CloseableIterator<SAMRecord>, Iterab
|
||||||
* @return SAMRecord representing the next read
|
* @return SAMRecord representing the next read
|
||||||
*/
|
*/
|
||||||
public SAMRecord next() {
|
public SAMRecord next() {
|
||||||
if (!isOpen) {
|
return record;
|
||||||
throw new UnsupportedOperationException("You cannot call next on a closed iterator");
|
|
||||||
}
|
|
||||||
++currentCount;
|
|
||||||
return iterator.next();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -110,7 +110,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns true if any of the underlying iterators has more records, otherwise false. */
|
/** Returns true if any of the underlying iterators has more records, otherwise false. */
|
||||||
public synchronized boolean hasNext() {
|
public boolean hasNext() {
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
lazyInitialization();
|
lazyInitialization();
|
||||||
}
|
}
|
||||||
|
|
@ -118,7 +118,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next record from the top most iterator during merging. */
|
/** Returns the next record from the top most iterator during merging. */
|
||||||
public synchronized SAMRecord next() {
|
public SAMRecord next() {
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
lazyInitialization();
|
lazyInitialization();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -284,6 +284,24 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move this Genome loc to the next contig, with a start
|
||||||
|
* and stop of 1.
|
||||||
|
* @return true if we are not out of contigs, otherwise false if we're
|
||||||
|
* at the end of the genome (no more contigs to jump to).
|
||||||
|
*/
|
||||||
|
public boolean toNextContig() {
|
||||||
|
if (contigIndex < GenomeLoc.contigInfo.size()) {
|
||||||
|
this.contigIndex++;
|
||||||
|
this.start = 1;
|
||||||
|
this.stop = 1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true iff we have a specified series of locations to process AND we are past the last
|
* Returns true iff we have a specified series of locations to process AND we are past the last
|
||||||
* location in the list. It means that, in a serial processing of the genome, that we are done.
|
* location in the list. It means that, in a serial processing of the genome, that we are done.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue