First successful test of new sharding system prototype. Can traverse over reads from a single
BAM file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2587 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
db9570ae29
commit
b19bb19f3d
|
|
@ -36,6 +36,7 @@ public class BAMChunkIterator implements Iterator<Chunk> {
|
||||||
this.blockIterator = blockIterator;
|
this.blockIterator = blockIterator;
|
||||||
this.prefetchedSegments = new LinkedList<BlockSegment>();
|
this.prefetchedSegments = new LinkedList<BlockSegment>();
|
||||||
this.filters = new PriorityQueue<Chunk>(filters);
|
this.filters = new PriorityQueue<Chunk>(filters);
|
||||||
|
seedNextSegments();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@ import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.DataInputStream;
|
import java.io.DataInputStream;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a BAM file header from a file, optionally providing its position
|
* Loads a BAM file header from a file, optionally providing its position
|
||||||
|
|
@ -27,6 +29,8 @@ public class BAMFileHeaderLoader {
|
||||||
*/
|
*/
|
||||||
private final Chunk location;
|
private final Chunk location;
|
||||||
|
|
||||||
|
public static final Chunk preambleLocation = new Chunk(0<<16 | 0, 0<<16 | 3);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load the header from the given file.
|
* Load the header from the given file.
|
||||||
* @param header the parsed header for the BAM file.
|
* @param header the parsed header for the BAM file.
|
||||||
|
|
@ -69,9 +73,55 @@ public class BAMFileHeaderLoader {
|
||||||
headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||||
SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath());
|
SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath());
|
||||||
|
|
||||||
|
// directly copied from BAMFileReader...
|
||||||
|
final int sequenceCount = binaryCodec.readInt();
|
||||||
|
if (header.getSequenceDictionary().size() > 0) {
|
||||||
|
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||||
|
if (sequenceCount != header.getSequenceDictionary().size()) {
|
||||||
|
throw new SAMFormatException("Number of sequences in text header (" +
|
||||||
|
header.getSequenceDictionary().size() +
|
||||||
|
") != number of sequences in binary header (" + sequenceCount + ") for file " + file);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < sequenceCount; i++) {
|
||||||
|
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(binaryCodec,file);
|
||||||
|
final SAMSequenceRecord sequenceRecord = header.getSequence(i);
|
||||||
|
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||||
|
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||||
|
binaryCodec);
|
||||||
|
}
|
||||||
|
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||||
|
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||||
|
binaryCodec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If only binary sequences are present, copy them into mFileHeader
|
||||||
|
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||||
|
for (int i = 0; i < sequenceCount; i++) {
|
||||||
|
sequences.add(readSequenceRecord(binaryCodec,file));
|
||||||
|
}
|
||||||
|
header.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||||
|
}
|
||||||
inputStream.close();
|
inputStream.close();
|
||||||
|
|
||||||
return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer()));
|
return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer()-1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a single binary sequence record from the file or stream
|
||||||
|
* @param binaryCodec stream to read from.
|
||||||
|
* @param file Note that this is used only for reporting errors.
|
||||||
|
* @return an individual sequence record.
|
||||||
|
*/
|
||||||
|
private static SAMSequenceRecord readSequenceRecord(final BinaryCodec binaryCodec, final File file) {
|
||||||
|
final int nameLength = binaryCodec.readInt();
|
||||||
|
if (nameLength <= 1) {
|
||||||
|
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file.getAbsolutePath());
|
||||||
|
}
|
||||||
|
final String sequenceName = binaryCodec.readString(nameLength - 1);
|
||||||
|
// Skip the null terminator
|
||||||
|
binaryCodec.readByte();
|
||||||
|
final int sequenceLength = binaryCodec.readInt();
|
||||||
|
return new SAMSequenceRecord(sequenceName, sequenceLength);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -490,10 +490,7 @@ class BAMFileReader2
|
||||||
mFilePointerLimit = endOffset;
|
mFilePointerLimit = endOffset;
|
||||||
}
|
}
|
||||||
// Pull next record from stream
|
// Pull next record from stream
|
||||||
final SAMRecord record = super.getNextRecord();
|
return super.getNextRecord();
|
||||||
if (record == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ class BlockSegment {
|
||||||
* @return the chunk equivalent of this block.
|
* @return the chunk equivalent of this block.
|
||||||
*/
|
*/
|
||||||
public Chunk toChunk() {
|
public Chunk toChunk() {
|
||||||
return new Chunk(position << 16 & blockStart,position << 16 & blockStop);
|
return new Chunk(position << 16 | blockStart,position << 16 | blockStop);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ import java.util.ArrayList;
|
||||||
* @author mhanna
|
* @author mhanna
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
*/
|
*/
|
||||||
class Chunk implements Comparable<Chunk> {
|
public class Chunk implements Comparable<Chunk> {
|
||||||
|
|
||||||
private long mChunkStart;
|
private long mChunkStart;
|
||||||
private long mChunkEnd;
|
private long mChunkEnd;
|
||||||
|
|
|
||||||
|
|
@ -26,13 +26,14 @@
|
||||||
package org.broadinstitute.sting.gatk;
|
package org.broadinstitute.sting.gatk;
|
||||||
|
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
|
|
@ -329,11 +330,11 @@ public class GenomeAnalysisEngine {
|
||||||
public List<Set<String>> getSamplesByReaders() {
|
public List<Set<String>> getSamplesByReaders() {
|
||||||
|
|
||||||
|
|
||||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
Collection<SAMFileReader> readers = getDataSource().getReaders();
|
||||||
|
|
||||||
List<Set<String>> sample_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
List<Set<String>> sample_sets = new ArrayList<Set<String>>(readers.size());
|
||||||
|
|
||||||
for (SAMFileReader r : hm.getReaders()) {
|
for (SAMFileReader r : readers) {
|
||||||
|
|
||||||
Set<String> samples = new HashSet<String>(1);
|
Set<String> samples = new HashSet<String>(1);
|
||||||
sample_sets.add(samples);
|
sample_sets.add(samples);
|
||||||
|
|
@ -358,11 +359,11 @@ public class GenomeAnalysisEngine {
|
||||||
public List<Set<String>> getLibrariesByReaders() {
|
public List<Set<String>> getLibrariesByReaders() {
|
||||||
|
|
||||||
|
|
||||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
Collection<SAMFileReader> readers = getDataSource().getReaders();
|
||||||
|
|
||||||
List<Set<String>> lib_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
List<Set<String>> lib_sets = new ArrayList<Set<String>>(readers.size());
|
||||||
|
|
||||||
for (SAMFileReader r : hm.getReaders()) {
|
for (SAMFileReader r : readers) {
|
||||||
|
|
||||||
Set<String> libs = new HashSet<String>(2);
|
Set<String> libs = new HashSet<String>(2);
|
||||||
lib_sets.add(libs);
|
lib_sets.add(libs);
|
||||||
|
|
@ -387,20 +388,20 @@ public class GenomeAnalysisEngine {
|
||||||
public List<Set<String>> getMergedReadGroupsByReaders() {
|
public List<Set<String>> getMergedReadGroupsByReaders() {
|
||||||
|
|
||||||
|
|
||||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
Collection<SAMFileReader> readers = getDataSource().getReaders();
|
||||||
|
|
||||||
List<Set<String>> rg_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
List<Set<String>> rg_sets = new ArrayList<Set<String>>(readers.size());
|
||||||
|
|
||||||
for (SAMFileReader r : hm.getReaders()) {
|
for (SAMFileReader r : readers) {
|
||||||
|
|
||||||
Set<String> groups = new HashSet<String>(5);
|
Set<String> groups = new HashSet<String>(5);
|
||||||
rg_sets.add(groups);
|
rg_sets.add(groups);
|
||||||
|
|
||||||
for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) {
|
for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) {
|
||||||
if (hm.hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
|
if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so:
|
||||||
// use HeaderMerger to translate original read group id from the reader into the read group id in the
|
// use HeaderMerger to translate original read group id from the reader into the read group id in the
|
||||||
// merged stream, and save that remapped read group id to associate it with specific reader
|
// merged stream, and save that remapped read group id to associate it with specific reader
|
||||||
groups.add(hm.getReadGroupId(r, g.getReadGroupId()));
|
groups.add(getDataSource().getReadGroupId(r, g.getReadGroupId()));
|
||||||
} else {
|
} else {
|
||||||
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
// otherwise, pass through the unmapped read groups since this is what Picard does as well
|
||||||
groups.add(g.getReadGroupId());
|
groups.add(g.getReadGroupId());
|
||||||
|
|
@ -609,26 +610,29 @@ public class GenomeAnalysisEngine {
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
||||||
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
|
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
|
||||||
|
|
||||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||||
|
shardType,
|
||||||
drivingDataSource.getSequenceDictionary(),
|
drivingDataSource.getSequenceDictionary(),
|
||||||
SHARD_SIZE,
|
SHARD_SIZE,
|
||||||
intervals, maxIterations);
|
intervals, maxIterations);
|
||||||
} else
|
} else
|
||||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||||
drivingDataSource.getSequenceDictionary(),
|
drivingDataSource.getSequenceDictionary(),
|
||||||
SHARD_SIZE, maxIterations);
|
SHARD_SIZE, maxIterations);
|
||||||
} else if (walker instanceof ReadWalker ||
|
} else if (walker instanceof ReadWalker ||
|
||||||
walker instanceof DuplicateWalker) {
|
walker instanceof DuplicateWalker) {
|
||||||
|
if(argCollection.experimentalSharding)
|
||||||
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
|
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
|
||||||
|
else
|
||||||
|
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
|
||||||
|
|
||||||
if (intervals != null && !intervals.isEmpty()) {
|
if (intervals != null && !intervals.isEmpty()) {
|
||||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,shardType,
|
||||||
drivingDataSource.getSequenceDictionary(),
|
drivingDataSource.getSequenceDictionary(),
|
||||||
SHARD_SIZE,
|
SHARD_SIZE,
|
||||||
intervals, maxIterations);
|
intervals, maxIterations);
|
||||||
} else {
|
} else {
|
||||||
shardStrategy = ShardStrategyFactory.shatter(shardType,
|
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,shardType,
|
||||||
drivingDataSource.getSequenceDictionary(),
|
drivingDataSource.getSequenceDictionary(),
|
||||||
SHARD_SIZE, maxIterations);
|
SHARD_SIZE, maxIterations);
|
||||||
}
|
}
|
||||||
|
|
@ -636,7 +640,8 @@ public class GenomeAnalysisEngine {
|
||||||
if ((intervals == null || intervals.isEmpty()) && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST))
|
if ((intervals == null || intervals.isEmpty()) && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST))
|
||||||
Utils.warnUser("walker is of type LocusWindow (which operates over intervals), but no intervals were provided." +
|
Utils.warnUser("walker is of type LocusWindow (which operates over intervals), but no intervals were provided." +
|
||||||
"This may be unintentional, check your command-line arguments.");
|
"This may be unintentional, check your command-line arguments.");
|
||||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL,
|
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||||
|
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL,
|
||||||
drivingDataSource.getSequenceDictionary(),
|
drivingDataSource.getSequenceDictionary(),
|
||||||
SHARD_SIZE,
|
SHARD_SIZE,
|
||||||
intervals, maxIterations);
|
intervals, maxIterations);
|
||||||
|
|
@ -657,7 +662,11 @@ public class GenomeAnalysisEngine {
|
||||||
if (reads.getReadsFiles().size() == 0)
|
if (reads.getReadsFiles().size() == 0)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(reads);
|
SAMDataSource dataSource = null;
|
||||||
|
if(argCollection.experimentalSharding)
|
||||||
|
dataSource = new BlockDrivenSAMDataSource(reads);
|
||||||
|
else
|
||||||
|
dataSource = new IndexDrivenSAMDataSource(reads);
|
||||||
|
|
||||||
return dataSource;
|
return dataSource;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -146,6 +146,9 @@ public class GATKArgumentCollection {
|
||||||
@Argument(fullName = "enableRodWalkers", shortName = "erw", doc = "Enable experimental rodWalker support. TEMPORARY HACK TO ALLOW EXPERIMENTATION WITH ROD WALKERS. [default is false]}.", required = false)
|
@Argument(fullName = "enableRodWalkers", shortName = "erw", doc = "Enable experimental rodWalker support. TEMPORARY HACK TO ALLOW EXPERIMENTATION WITH ROD WALKERS. [default is false]}.", required = false)
|
||||||
public boolean enableRodWalkers = false;
|
public boolean enableRodWalkers = false;
|
||||||
|
|
||||||
|
@Element(required = false)
|
||||||
|
@Argument(fullName = "experimental_sharding",shortName="es", doc="Use the experimental sharding strategy. Will not work for all traversal types.", required = false)
|
||||||
|
public boolean experimentalSharding = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* marshal the data out to a object
|
* marshal the data out to a object
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.Chunk;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expresses a shard of read data in block format.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BlockDelimitedReadShard extends ReadShard {
|
||||||
|
/**
|
||||||
|
* The list of chunks to retrieve when loading this shard.
|
||||||
|
*/
|
||||||
|
private final List<Chunk> chunks;
|
||||||
|
|
||||||
|
public BlockDelimitedReadShard(List<Chunk> chunks) {
|
||||||
|
this.chunks = chunks;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the list of chunks delimiting this shard.
|
||||||
|
* @return a list of chunks that contain data for this shard.
|
||||||
|
*/
|
||||||
|
public List<Chunk> getChunks() {
|
||||||
|
return chunks;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* String representation of this shard.
|
||||||
|
* @return A string representation of the boundaries of this shard.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for(Chunk chunk : chunks) {
|
||||||
|
sb.append(chunk);
|
||||||
|
sb.append(' ');
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,101 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.Chunk;
|
||||||
|
import net.sf.samtools.BAMFileHeaderLoader;
|
||||||
|
import net.sf.samtools.BAMChunkIterator;
|
||||||
|
import net.sf.samtools.BAMBlockIterator;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A read shard strategy that delimits based on the number of
|
||||||
|
* blocks in the BAM file.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
|
||||||
|
/**
|
||||||
|
* Number of blocks in a given shard.
|
||||||
|
*/
|
||||||
|
protected int blockCount = 100;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The actual chunks streaming into the file.
|
||||||
|
*/
|
||||||
|
private final BAMChunkIterator chunkIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The data backing the next chunks to deliver to the traversal engine.
|
||||||
|
*/
|
||||||
|
private final List<Chunk> nextChunks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new read shard strategy, loading read shards from the given BAM file.
|
||||||
|
* @param dataSource Data source from which to load shards.
|
||||||
|
*/
|
||||||
|
public BlockDelimitedReadShardStrategy(SAMDataSource dataSource) {
|
||||||
|
if(dataSource.getReadsInfo().getReadsFiles().size() > 1)
|
||||||
|
throw new UnsupportedOperationException("Experimental sharding only works with a single BAM at the moment.");
|
||||||
|
File bamFile = dataSource.getReadsInfo().getReadsFiles().get(0);
|
||||||
|
try {
|
||||||
|
Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation();
|
||||||
|
chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile),Arrays.asList(BAMFileHeaderLoader.preambleLocation,headerLocation));
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new StingException("Unable to open BAM file for sharding.");
|
||||||
|
}
|
||||||
|
nextChunks = new ArrayList<Chunk>();
|
||||||
|
|
||||||
|
advance();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* do we have another read shard?
|
||||||
|
* @return True if any more data is available. False otherwise.
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return nextChunks.size() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the next shard, if available.
|
||||||
|
* @return The next shard, if available.
|
||||||
|
* @throws NoSuchElementException if no such shard is available.
|
||||||
|
*/
|
||||||
|
public Shard next() {
|
||||||
|
if(!hasNext())
|
||||||
|
throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard.");
|
||||||
|
Shard shard = new BlockDelimitedReadShard(Collections.unmodifiableList(new ArrayList<Chunk>(nextChunks)));
|
||||||
|
advance();
|
||||||
|
return shard;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @throws UnsupportedOperationException always.
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Remove not supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenience method for using ShardStrategy in an foreach loop.
|
||||||
|
* @return A iterator over shards.
|
||||||
|
*/
|
||||||
|
public Iterator<Shard> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void advance() {
|
||||||
|
nextChunks.clear();
|
||||||
|
int chunksCopied = 0;
|
||||||
|
|
||||||
|
while(chunksCopied++ < blockCount && chunkIterator.hasNext())
|
||||||
|
nextChunks.add(chunkIterator.next());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -62,4 +62,13 @@ public class IntervalShard implements Shard {
|
||||||
public Shard.ShardType getShardType() {
|
public Shard.ShardType getShardType() {
|
||||||
return mType;
|
return mType;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* String representation of this shard.
|
||||||
|
* @return A string representation of the boundaries of this shard.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return mSet.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,51 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A read shard delimited by an actual read count, rather than blocks or any other
|
||||||
|
* physical mapping of the BAM file.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class ReadDelimitedReadShard extends ReadShard {
|
||||||
|
// the count of the reads we want to copy off
|
||||||
|
private int size = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our tie in for the shard strategy. This allows us to signal to the shard
|
||||||
|
* strategy that we've finished process, so it can indicate that we're out of reads
|
||||||
|
*/
|
||||||
|
private final ReadDelimitedReadShardStrategy strat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a read shard, given a read size
|
||||||
|
* @param strat The sharding strategy used to create this shard.
|
||||||
|
* @param size Size of the shard, in reads.
|
||||||
|
*/
|
||||||
|
ReadDelimitedReadShard(ReadDelimitedReadShardStrategy strat, int size) {
|
||||||
|
this.size = size;
|
||||||
|
this.strat = strat;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @return the genome location represented by this shard */
|
||||||
|
public int getSize() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this method is used as a backend, to signal to the sharding strategy that we've
|
||||||
|
* finished processing. When we move to a more read-aware bam system this method could disappear.
|
||||||
|
*/
|
||||||
|
public void signalDone() {
|
||||||
|
strat.signalDone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* String representation of this shard.
|
||||||
|
* @return A string representation of the boundaries of this shard.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String.format("%d reads", size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,86 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A shard strategy that breaks up shards based on how many reads are
|
||||||
|
* in each.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class ReadDelimitedReadShardStrategy extends ReadShardStrategy {
|
||||||
|
// our read bucket size, default
|
||||||
|
protected long readCount = 100000L;
|
||||||
|
|
||||||
|
// our hasnext flag
|
||||||
|
boolean hasNext = true;
|
||||||
|
|
||||||
|
// our limiting factor
|
||||||
|
long limitedSize = -1;
|
||||||
|
boolean stopDueToLimitingFactor = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the default constructor
|
||||||
|
* @param size the read count to iterate over
|
||||||
|
* @param limitedSize limit the shard to this length
|
||||||
|
*/
|
||||||
|
ReadDelimitedReadShardStrategy(long size, long limitedSize) {
|
||||||
|
readCount = size;
|
||||||
|
this.limitedSize = limitedSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* do we have another read shard?
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
if (stopDueToLimitingFactor) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return hasNext;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Shard next() {
|
||||||
|
if (limitedSize > 0) {
|
||||||
|
if (limitedSize > readCount) {
|
||||||
|
limitedSize = limitedSize - readCount;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
readCount = limitedSize;
|
||||||
|
limitedSize = 0;
|
||||||
|
stopDueToLimitingFactor = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ReadDelimitedReadShard(this,(int)readCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Remove not supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<Shard> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* set the next shards size
|
||||||
|
*
|
||||||
|
* @param size adjust the next size to this
|
||||||
|
*/
|
||||||
|
public void adjustNextShardSize(long size) {
|
||||||
|
readCount = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this function is a work-around for the fact that
|
||||||
|
* we don't know when we're out of reads until the SAM data source
|
||||||
|
* tells us so.
|
||||||
|
*/
|
||||||
|
public void signalDone() {
|
||||||
|
hasNext = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -18,68 +18,16 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* the base class for read shards.
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* <p/>
|
|
||||||
* ReadShard
|
|
||||||
* <p/>
|
|
||||||
* the base class for read shards.
|
|
||||||
*/
|
*/
|
||||||
public class ReadShard implements Shard {
|
public abstract class ReadShard implements Shard {
|
||||||
|
|
||||||
// the count of the reads we want to copy off
|
|
||||||
private int size = 0;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* our tie in for the shard strategy. This allows us to signal to the shard
|
|
||||||
* strategy that we've finished process, so it can indicate that we're out of reads
|
|
||||||
*/
|
|
||||||
private final ReadShardStrategy str;
|
|
||||||
|
|
||||||
// the reference back to our read shard strategy
|
|
||||||
private final ReadShardStrategy strat;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create a read shard, given a read size
|
|
||||||
*
|
|
||||||
* @param size
|
|
||||||
*/
|
|
||||||
ReadShard(int size, ReadShardStrategy strat) {
|
|
||||||
this.str = null;
|
|
||||||
this.size = size;
|
|
||||||
this.strat = strat;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create a read shard, given a read size
|
|
||||||
*
|
|
||||||
* @param size
|
|
||||||
*/
|
|
||||||
ReadShard(ReadShardStrategy caller, int size, ReadShardStrategy strat) {
|
|
||||||
this.str = caller;
|
|
||||||
this.size = size;
|
|
||||||
this.strat = strat;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @return the genome location represented by this shard */
|
/** @return the genome location represented by this shard */
|
||||||
public GenomeLoc getGenomeLoc() {
|
public GenomeLoc getGenomeLoc() {
|
||||||
throw new UnsupportedOperationException("ReadShard isn't genome loc aware");
|
throw new UnsupportedOperationException("ReadShard isn't genome loc aware");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return the genome location represented by this shard */
|
|
||||||
public int getSize() {
|
|
||||||
return size;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* this method is used as a backend, to signal to the sharding strategy that we've
|
|
||||||
* finished processing. When we move to a more read-aware bam system this method could disappear.
|
|
||||||
*/
|
|
||||||
public void signalDone() {
|
|
||||||
strat.signalDone();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* what kind of shard do we return
|
* what kind of shard do we return
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -40,85 +40,12 @@ import java.util.Iterator;
|
||||||
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
||||||
* has a specific number of reads (default to 100K) which is configured in the constructor.
|
* has a specific number of reads (default to 100K) which is configured in the constructor.
|
||||||
*/
|
*/
|
||||||
public class ReadShardStrategy implements ShardStrategy {
|
public abstract class ReadShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
// do we use unmapped reads in the sharding strategy
|
// do we use unmapped reads in the sharding strategy
|
||||||
private boolean unMappedReads = true;
|
private boolean unMappedReads = true;
|
||||||
|
|
||||||
// our read bucket size, default
|
|
||||||
protected long readCount = 100000L;
|
|
||||||
|
|
||||||
// our sequence dictionary
|
|
||||||
final private SAMSequenceDictionary dic;
|
|
||||||
|
|
||||||
// our hasnext flag
|
|
||||||
boolean hasNext = true;
|
|
||||||
|
|
||||||
// our limiting factor
|
|
||||||
long limitedSize = -1;
|
|
||||||
boolean stopDueToLimitingFactor = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* the default constructor
|
|
||||||
* @param dic the sequence dictionary to use
|
|
||||||
* @param size the read count to iterate over
|
|
||||||
*/
|
|
||||||
ReadShardStrategy(SAMSequenceDictionary dic, long size, long limitedSize) {
|
|
||||||
this.dic = dic;
|
|
||||||
readCount = size;
|
|
||||||
this.limitedSize = limitedSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* do we have another read shard?
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public boolean hasNext() {
|
|
||||||
if (stopDueToLimitingFactor) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return hasNext;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Shard next() {
|
|
||||||
if (limitedSize > 0) {
|
|
||||||
if (limitedSize > readCount) {
|
|
||||||
limitedSize = limitedSize - readCount;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
readCount = limitedSize;
|
|
||||||
limitedSize = 0;
|
|
||||||
stopDueToLimitingFactor = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ReadShard((int)readCount, this);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("Remove not supported");
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<Shard> iterator() {
|
public Iterator<Shard> iterator() {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* set the next shards size
|
|
||||||
*
|
|
||||||
* @param size adjust the next size to this
|
|
||||||
*/
|
|
||||||
public void adjustNextShardSize(long size) {
|
|
||||||
readCount = size;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* this function is a work-around for the fact that
|
|
||||||
* we don't know when we're out of reads until the SAM data source
|
|
||||||
* tells us so.
|
|
||||||
*/
|
|
||||||
public void signalDone() {
|
|
||||||
hasNext = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,13 +28,4 @@ import java.util.Iterator;
|
||||||
* class, but not this will be an interface to accommodate read based sharding
|
* class, but not this will be an interface to accommodate read based sharding
|
||||||
*/
|
*/
|
||||||
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
||||||
|
|
||||||
/**
|
|
||||||
* set the next shards size
|
|
||||||
*
|
|
||||||
* @param size adjust the next size to this
|
|
||||||
*/
|
|
||||||
public abstract void adjustNextShardSize(long size);
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,9 @@ import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -37,6 +40,7 @@ public class ShardStrategyFactory {
|
||||||
LINEAR,
|
LINEAR,
|
||||||
EXPONENTIAL,
|
EXPONENTIAL,
|
||||||
READS,
|
READS,
|
||||||
|
READS_EXPERIMENTAL,
|
||||||
INTERVAL,
|
INTERVAL,
|
||||||
MONOLITHIC // Put all of the available data into one shard.
|
MONOLITHIC // Put all of the available data into one shard.
|
||||||
}
|
}
|
||||||
|
|
@ -48,31 +52,35 @@ public class ShardStrategyFactory {
|
||||||
/**
|
/**
|
||||||
* get a new shatter strategy
|
* get a new shatter strategy
|
||||||
*
|
*
|
||||||
|
* @param dataSource the SAM data source for the reads; currently used by the READS_EXPERIMENTAL strategy.
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||||
* @param dic the seq dictionary
|
* @param dic the seq dictionary
|
||||||
* @param startingSize the starting size
|
* @param startingSize the starting size
|
||||||
* @return
|
* @return a shard strategy capable of dividing input data into shards.
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) {
|
static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) {
|
||||||
return ShardStrategyFactory.shatter(strat, dic, startingSize, -1L);
|
return ShardStrategyFactory.shatter(dataSource, strat, dic, startingSize, -1L);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get a new shatter strategy
|
* get a new shatter strategy
|
||||||
*
|
*
|
||||||
|
* @param dataSource the SAM data source for the reads; currently used by the READS_EXPERIMENTAL strategy.
|
||||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||||
* @param dic the seq dictionary
|
* @param dic the seq dictionary
|
||||||
* @param startingSize the starting size
|
* @param startingSize the starting size
|
||||||
* @return
|
* @return a shard strategy capable of dividing input data into shards.
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, long limitByCount) {
|
static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, long limitByCount) {
|
||||||
switch (strat) {
|
switch (strat) {
|
||||||
case LINEAR:
|
case LINEAR:
|
||||||
return new LinearLocusShardStrategy(dic, startingSize, limitByCount);
|
return new LinearLocusShardStrategy(dic, startingSize, limitByCount);
|
||||||
case EXPONENTIAL:
|
case EXPONENTIAL:
|
||||||
return new ExpGrowthLocusShardStrategy(dic, startingSize, limitByCount);
|
return new ExpGrowthLocusShardStrategy(dic, startingSize, limitByCount);
|
||||||
case READS:
|
case READS:
|
||||||
return new ReadShardStrategy(dic, startingSize, limitByCount);
|
return new ReadDelimitedReadShardStrategy(startingSize, limitByCount);
|
||||||
|
case READS_EXPERIMENTAL:
|
||||||
|
return new BlockDelimitedReadShardStrategy(dataSource);
|
||||||
case INTERVAL:
|
case INTERVAL:
|
||||||
throw new StingException("Requested trategy: " + strat + " doesn't work with the limiting count (-M) command line option");
|
throw new StingException("Requested trategy: " + strat + " doesn't work with the limiting count (-M) command line option");
|
||||||
default:
|
default:
|
||||||
|
|
@ -90,8 +98,8 @@ public class ShardStrategyFactory {
|
||||||
* @param startingSize the starting size
|
* @param startingSize the starting size
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) {
|
static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) {
|
||||||
return ShardStrategyFactory.shatter(strat, dic, startingSize, lst, -1l);
|
return ShardStrategyFactory.shatter(dataSource, strat, dic, startingSize, lst, -1l);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -103,7 +111,7 @@ public class ShardStrategyFactory {
|
||||||
* @param startingSize the starting size
|
* @param startingSize the starting size
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst, long limitDataCount) {
|
static public ShardStrategy shatter(SAMDataSource dataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst, long limitDataCount) {
|
||||||
switch (strat) {
|
switch (strat) {
|
||||||
case LINEAR:
|
case LINEAR:
|
||||||
return new LinearLocusShardStrategy(dic, startingSize, lst, limitDataCount);
|
return new LinearLocusShardStrategy(dic, startingSize, lst, limitDataCount);
|
||||||
|
|
@ -113,6 +121,8 @@ public class ShardStrategyFactory {
|
||||||
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.LOCUS_INTERVAL);
|
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.LOCUS_INTERVAL);
|
||||||
case READS:
|
case READS:
|
||||||
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.READ_INTERVAL);
|
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.READ_INTERVAL);
|
||||||
|
case READS_EXPERIMENTAL:
|
||||||
|
throw new UnsupportedOperationException("Cannot do experimental read sharding with intervals");
|
||||||
default:
|
default:
|
||||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
throw new StingException("Strategy: " + strat + " isn't implemented");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.BlockDelimitedReadShard;
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMFileReader2;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An iterator that's aware of how data is stored on disk in SAM format.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BlockDrivenSAMDataSource extends SAMDataSource {
|
||||||
|
|
||||||
|
private final SAMFileReader2 reader;
|
||||||
|
/**
|
||||||
|
* Create a new block-aware SAM data source given the supplied read metadata.
|
||||||
|
* @param reads The read metadata.
|
||||||
|
*/
|
||||||
|
public BlockDrivenSAMDataSource(Reads reads) {
|
||||||
|
super(reads);
|
||||||
|
|
||||||
|
if(reads.getReadsFiles().size() > 1)
|
||||||
|
throw new StingException("Experimental sharding strategy cannot handle multiple BAM files at this point.");
|
||||||
|
|
||||||
|
File readsFile = reads.getReadsFiles().get(0);
|
||||||
|
reader = new SAMFileReader2(readsFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasIndex() {
|
||||||
|
return reader.hasIndex();
|
||||||
|
}
|
||||||
|
|
||||||
|
public StingSAMIterator seek(Shard shard) {
|
||||||
|
if(!(shard instanceof BlockDelimitedReadShard))
|
||||||
|
throw new StingException("Currently unable to operate on types other than block delimited read shards.");
|
||||||
|
CloseableIterator<SAMRecord> iterator = reader.iterator(((BlockDelimitedReadShard)shard).getChunks());
|
||||||
|
return applyDecoratingIterators(true,
|
||||||
|
StingSAMIteratorAdapter.adapt(reads, iterator),
|
||||||
|
reads.getDownsamplingFraction(),
|
||||||
|
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
|
reads.getSupplementalFilters());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the merged header from the SAM file.
|
||||||
|
* @return The merged header.
|
||||||
|
*/
|
||||||
|
public SAMFileHeader getHeader() {
|
||||||
|
return reader.getFileHeader();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Currently unsupported.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Collection<SAMFileReader> getReaders() {
|
||||||
|
throw new StingException("Currently unable to get readers for shard-based fields.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* No read group collisions at this time because only one SAM file is currently supported.
|
||||||
|
* @return False always.
|
||||||
|
*/
|
||||||
|
public boolean hasReadGroupCollisions() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Currently unsupported.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
|
||||||
|
throw new UnsupportedOperationException("Getting read group ID from this experimental SAM reader is not currently supported.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,423 @@
|
||||||
|
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
|
import net.sf.picard.filter.FilteringIterator;
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.shards.ReadDelimitedReadShard;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: aaron
|
||||||
|
* Date: Mar 26, 2009
|
||||||
|
* Time: 2:36:16 PM
|
||||||
|
* <p/>
|
||||||
|
* Converts shards to SAM iterators over the specified region
|
||||||
|
*/
|
||||||
|
public class IndexDrivenSAMDataSource extends SAMDataSource {
|
||||||
|
// used for the reads case, the last count of reads retrieved
|
||||||
|
long readsTaken = 0;
|
||||||
|
|
||||||
|
// our last genome loc position
|
||||||
|
protected GenomeLoc lastReadPos = null;
|
||||||
|
|
||||||
|
// do we take unmapped reads
|
||||||
|
private boolean includeUnmappedReads = true;
|
||||||
|
|
||||||
|
// reads based traversal variables
|
||||||
|
private boolean intoUnmappedReads = false;
|
||||||
|
private int readsSeenAtLastPos = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A histogram of exactly what reads were removed from the input stream and why.
|
||||||
|
*/
|
||||||
|
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
|
||||||
|
|
||||||
|
// A pool of SAM iterators.
|
||||||
|
private SAMResourcePool resourcePool = null;
|
||||||
|
|
||||||
|
private GenomeLoc mLastInterval = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
|
||||||
|
* @return Histogram of reads. Will not be null.
|
||||||
|
*/
|
||||||
|
public SAMReadViolationHistogram getViolationHistogram() {
|
||||||
|
return violations;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* constructor, given sam files
|
||||||
|
*
|
||||||
|
* @param reads the list of sam files
|
||||||
|
*/
|
||||||
|
public IndexDrivenSAMDataSource( Reads reads ) throws SimpleDataSourceLoadException {
|
||||||
|
super(reads);
|
||||||
|
resourcePool = new SAMResourcePool(reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do all BAM files backing this data source have an index? The case where hasIndex() is false
|
||||||
|
* is supported, but only in a few extreme cases.
|
||||||
|
* @return True if an index is present; false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean hasIndex() {
|
||||||
|
return resourcePool.hasIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the (potentially merged) SAM file header.
|
||||||
|
*
|
||||||
|
* @return SAM file header.
|
||||||
|
*/
|
||||||
|
public SAMFileHeader getHeader() {
|
||||||
|
return resourcePool.getHeader();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns Reads data structure containing information about the reads data sources placed in this pool as well as
|
||||||
|
* information about how they are downsampled, sorted, and filtered
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Reads getReadsInfo() { return reads; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
||||||
|
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
||||||
|
* prior to the merging too) maintained by the system.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Collection<SAMFileReader> getReaders() { return resourcePool.getHeaderMerger().getReaders(); }
|
||||||
|
|
||||||
|
/** Returns true if there are read group duplicates within the merged headers. */
|
||||||
|
public boolean hasReadGroupCollisions() {
|
||||||
|
return resourcePool.getHeaderMerger().hasReadGroupCollisions();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the read group id that should be used for the input read and RG id. */
|
||||||
|
public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
|
||||||
|
return resourcePool.getHeaderMerger().getReadGroupId(reader,originalReadGroupId);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param shard the shard to get data for
|
||||||
|
*
|
||||||
|
* @return an iterator for that region
|
||||||
|
*/
|
||||||
|
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
|
||||||
|
// setup the iterator pool if it's not setup
|
||||||
|
boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? false : true;
|
||||||
|
resourcePool.setQueryOverlapping(queryOverlapping);
|
||||||
|
|
||||||
|
StingSAMIterator iterator = null;
|
||||||
|
if (shard.getShardType() == Shard.ShardType.READ) {
|
||||||
|
iterator = seekRead(shard);
|
||||||
|
iterator = applyDecoratingIterators(true,
|
||||||
|
iterator,
|
||||||
|
reads.getDownsamplingFraction(),
|
||||||
|
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
|
reads.getSupplementalFilters());
|
||||||
|
} else if (shard.getShardType() == Shard.ShardType.LOCUS) {
|
||||||
|
iterator = seekLocus(shard);
|
||||||
|
iterator = applyDecoratingIterators(false,
|
||||||
|
iterator,
|
||||||
|
reads.getDownsamplingFraction(),
|
||||||
|
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
|
reads.getSupplementalFilters());
|
||||||
|
} else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) ||
|
||||||
|
(shard.getShardType() == Shard.ShardType.READ_INTERVAL)) {
|
||||||
|
iterator = seekLocus(shard);
|
||||||
|
iterator = applyDecoratingIterators(false,
|
||||||
|
iterator,
|
||||||
|
reads.getDownsamplingFraction(),
|
||||||
|
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
|
reads.getSupplementalFilters());
|
||||||
|
|
||||||
|
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
|
||||||
|
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL )
|
||||||
|
iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false));
|
||||||
|
mLastInterval = shard.getGenomeLoc();
|
||||||
|
} else {
|
||||||
|
|
||||||
|
throw new StingException("seek: Unknown shard type");
|
||||||
|
}
|
||||||
|
|
||||||
|
return iterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* seekLocus
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param shard the shard containing the genome location to extract data for
|
||||||
|
*
|
||||||
|
* @return an iterator for that region
|
||||||
|
*/
|
||||||
|
private StingSAMIterator seekLocus( Shard shard ) throws SimpleDataSourceLoadException {
|
||||||
|
if(shard instanceof MonolithicShard)
|
||||||
|
return createIterator(new EntireStream());
|
||||||
|
|
||||||
|
if( getHeader().getSequenceDictionary().getSequences().size() == 0 )
|
||||||
|
throw new StingException("Unable to seek to the given locus; reads data source has no alignment information.");
|
||||||
|
return createIterator( new MappedStreamSegment(shard.getGenomeLoc()) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* seek
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param shard the read shard to extract from
|
||||||
|
*
|
||||||
|
* @return an iterator for that region
|
||||||
|
*/
|
||||||
|
private StingSAMIterator seekRead( Shard shard ) throws SimpleDataSourceLoadException {
|
||||||
|
if(shard instanceof MonolithicShard)
|
||||||
|
return createIterator(new EntireStream());
|
||||||
|
|
||||||
|
ReadDelimitedReadShard readShard = (ReadDelimitedReadShard)shard;
|
||||||
|
StingSAMIterator iter = null;
|
||||||
|
|
||||||
|
// If there are no entries in the sequence dictionary, there can't possibly be any mapped reads. Force state to 'unmapped'.
|
||||||
|
if( isSequenceDictionaryEmpty() )
|
||||||
|
intoUnmappedReads = true;
|
||||||
|
|
||||||
|
if (!intoUnmappedReads) {
|
||||||
|
if (lastReadPos == null) {
|
||||||
|
lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
|
||||||
|
iter = createIterator(new MappedStreamSegment(lastReadPos));
|
||||||
|
return InitialReadIterator(readShard.getSize(), iter);
|
||||||
|
} else {
|
||||||
|
lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
|
||||||
|
iter = fastMappedReadSeek(readShard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos))));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (intoUnmappedReads && !includeUnmappedReads)
|
||||||
|
readShard.signalDone();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (intoUnmappedReads && includeUnmappedReads) {
|
||||||
|
if (iter != null)
|
||||||
|
iter.close();
|
||||||
|
iter = toUnmappedReads(readShard.getSize());
|
||||||
|
if (!iter.hasNext())
|
||||||
|
readShard.signalDone();
|
||||||
|
}
|
||||||
|
|
||||||
|
return iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If we're in by-read mode, this indicates if we want
|
||||||
|
* to see unmapped reads too. Only seeing mapped reads
|
||||||
|
* is much faster, but most BAM files have significant
|
||||||
|
* unmapped read counts.
|
||||||
|
*
|
||||||
|
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
||||||
|
*/
|
||||||
|
public void viewUnmappedReads( boolean seeUnMappedReads ) {
|
||||||
|
includeUnmappedReads = seeUnMappedReads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For unit testing, add a custom iterator pool.
|
||||||
|
*
|
||||||
|
* @param resourcePool Custom mock iterator pool.
|
||||||
|
*/
|
||||||
|
void setResourcePool( SAMResourcePool resourcePool ) {
|
||||||
|
this.resourcePool = resourcePool;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieve unmapped reads.
|
||||||
|
*
|
||||||
|
* @param readCount how many reads to retrieve
|
||||||
|
*
|
||||||
|
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||||
|
*/
|
||||||
|
StingSAMIterator toUnmappedReads( long readCount ) {
|
||||||
|
StingSAMIterator iter = createIterator(new UnmappedStreamSegment(readsTaken, readCount));
|
||||||
|
readsTaken += readCount;
|
||||||
|
return iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A seek function for mapped reads.
|
||||||
|
*
|
||||||
|
* @param readCount how many reads to retrieve
|
||||||
|
* @param iter the iterator to use, seeked to the correct start location
|
||||||
|
*
|
||||||
|
* @return the bounded iterator that you can use to get the intervaled reads from. Will be a zero-length
|
||||||
|
* iterator if no reads are available.
|
||||||
|
* @throws SimpleDataSourceLoadException
|
||||||
|
*/
|
||||||
|
StingSAMIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
|
||||||
|
BoundedReadIterator bound;
|
||||||
|
correctForReadPileupSeek(iter);
|
||||||
|
if (readsTaken == 0) {
|
||||||
|
return InitialReadIterator(readCount, iter);
|
||||||
|
}
|
||||||
|
int x = 0;
|
||||||
|
SAMRecord rec = null;
|
||||||
|
|
||||||
|
// Assuming that lastReadPos should never be null, because this is a mappedReadSeek
|
||||||
|
// and initial queries are handled by the previous conditional.
|
||||||
|
int lastContig = lastReadPos.getContigIndex();
|
||||||
|
int lastPos = (int)lastReadPos.getStart();
|
||||||
|
|
||||||
|
while (x < readsTaken) {
|
||||||
|
if (iter.hasNext()) {
|
||||||
|
rec = iter.next();
|
||||||
|
if (lastContig == rec.getReferenceIndex() && lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
|
||||||
|
else this.readsSeenAtLastPos = 1;
|
||||||
|
lastPos = rec.getAlignmentStart();
|
||||||
|
++x;
|
||||||
|
} else {
|
||||||
|
iter.close();
|
||||||
|
|
||||||
|
// jump contigs
|
||||||
|
lastReadPos = GenomeLocParser.toNextContig(lastReadPos);
|
||||||
|
if (lastReadPos == null) {
|
||||||
|
// check to see if we're using unmapped reads, if not return, we're done
|
||||||
|
readsTaken = 0;
|
||||||
|
intoUnmappedReads = true;
|
||||||
|
|
||||||
|
// fastMappedReadSeek must return an iterator, even if that iterator iterates through nothing.
|
||||||
|
return new NullSAMIterator(reads);
|
||||||
|
} else {
|
||||||
|
readsTaken = readCount;
|
||||||
|
readsSeenAtLastPos = 0;
|
||||||
|
lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
|
||||||
|
CloseableIterator<SAMRecord> ret = createIterator(new MappedStreamSegment(lastReadPos));
|
||||||
|
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we're off the end of the last contig (into unmapped territory)
|
||||||
|
if (rec != null && rec.getAlignmentStart() == 0) {
|
||||||
|
readsTaken += readCount;
|
||||||
|
intoUnmappedReads = true;
|
||||||
|
}
|
||||||
|
// else we're not off the end, store our location
|
||||||
|
else if (rec != null) {
|
||||||
|
int stopPos = rec.getAlignmentStart();
|
||||||
|
if (stopPos < lastReadPos.getStart()) {
|
||||||
|
lastReadPos = GenomeLocParser.createGenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
||||||
|
} else {
|
||||||
|
lastReadPos = GenomeLocParser.setStart(lastReadPos,rec.getAlignmentStart());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// in case we've run out of reads, get out
|
||||||
|
else {
|
||||||
|
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
||||||
|
//return null;
|
||||||
|
}
|
||||||
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
|
||||||
|
|
||||||
|
// return the iterator
|
||||||
|
return bound;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines whether the BAM file is completely unsequenced. Requires that the resource pool be initialized.
|
||||||
|
* @return True if the sequence dictionary is completely empty. False otherwise.
|
||||||
|
*/
|
||||||
|
private boolean isSequenceDictionaryEmpty() {
|
||||||
|
return getHeader().getSequenceDictionary().isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Even though the iterator has seeked to the correct location, there may be multiple reads at that location,
|
||||||
|
* and we may have given some of them out already. Move the iterator to the correct location using the readsSeenAtLastPos variable
|
||||||
|
*
|
||||||
|
* @param iter the iterator
|
||||||
|
*/
|
||||||
|
private void correctForReadPileupSeek( StingSAMIterator iter ) {
|
||||||
|
// move the number of reads we read from the last pos
|
||||||
|
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
||||||
|
for(int i = 0; i < this.readsSeenAtLastPos && iter.hasNext(); i++,iter.next())
|
||||||
|
atLeastOneReadSeen = true;
|
||||||
|
if (readsSeenAtLastPos > 0 && !atLeastOneReadSeen) {
|
||||||
|
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* set the initial iterator
|
||||||
|
*
|
||||||
|
* @param readCount the number of reads
|
||||||
|
* @param iter the merging iterator
|
||||||
|
*
|
||||||
|
* @return a bounded read iterator at the first read position in the file.
|
||||||
|
*/
|
||||||
|
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
|
||||||
|
BoundedReadIterator bound;
|
||||||
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
this.readsTaken = readCount;
|
||||||
|
return bound;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an iterator over the selected segment, from a resource pulled from the pool.
|
||||||
|
* @param segment Segment over which to gather reads.
|
||||||
|
* @return An iterator over just the reads in the given segment.
|
||||||
|
*/
|
||||||
|
private StingSAMIterator createIterator( DataStreamSegment segment ) {
|
||||||
|
StingSAMIterator iterator = resourcePool.iterator(segment);
|
||||||
|
StingSAMIterator malformedWrappedIterator = new MalformedSAMFilteringIterator( getHeader(), iterator, violations );
|
||||||
|
StingSAMIterator readWrappingIterator = new ReadWrappingIterator(malformedWrappedIterator);
|
||||||
|
return readWrappingIterator;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2,21 +2,17 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import net.sf.picard.filter.FilteringIterator;
|
import net.sf.picard.filter.FilteringIterator;
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard;
|
|
||||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.iterators.*;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
||||||
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
@ -54,36 +50,21 @@ import java.util.Collection;
|
||||||
* <p/>
|
* <p/>
|
||||||
* Converts shards to SAM iterators over the specified region
|
* Converts shards to SAM iterators over the specified region
|
||||||
*/
|
*/
|
||||||
public class SAMDataSource implements SimpleDataSource {
|
public abstract class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
/** Backing support for reads. */
|
/** Backing support for reads. */
|
||||||
private final Reads reads;
|
protected final Reads reads;
|
||||||
|
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||||
|
|
||||||
// used for the reads case, the last count of reads retrieved
|
|
||||||
long readsTaken = 0;
|
|
||||||
|
|
||||||
// our last genome loc position
|
|
||||||
protected GenomeLoc lastReadPos = null;
|
|
||||||
|
|
||||||
// do we take unmapped reads
|
// do we take unmapped reads
|
||||||
private boolean includeUnmappedReads = true;
|
protected boolean includeUnmappedReads = true;
|
||||||
|
|
||||||
// reads based traversal variables
|
|
||||||
private boolean intoUnmappedReads = false;
|
|
||||||
private int readsSeenAtLastPos = 0;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A histogram of exactly what reads were removed from the input stream and why.
|
* A histogram of exactly what reads were removed from the input stream and why.
|
||||||
*/
|
*/
|
||||||
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
|
protected SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
|
||||||
|
|
||||||
// A pool of SAM iterators.
|
|
||||||
private SAMResourcePool resourcePool = null;
|
|
||||||
|
|
||||||
private GenomeLoc mLastInterval = null;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
|
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
|
||||||
|
|
@ -110,7 +91,6 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
resourcePool = new SAMResourcePool(reads);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -118,18 +98,14 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* is supported, but only in a few extreme cases.
|
* is supported, but only in a few extreme cases.
|
||||||
* @return True if an index is present; false otherwise.
|
* @return True if an index is present; false otherwise.
|
||||||
*/
|
*/
|
||||||
public boolean hasIndex() {
|
public abstract boolean hasIndex();
|
||||||
return resourcePool.hasIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the (potentially merged) SAM file header.
|
* Gets the (potentially merged) SAM file header.
|
||||||
*
|
*
|
||||||
* @return SAM file header.
|
* @return SAM file header.
|
||||||
*/
|
*/
|
||||||
public SAMFileHeader getHeader() {
|
public abstract SAMFileHeader getHeader();
|
||||||
return resourcePool.getHeader();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -140,12 +116,15 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
public Reads getReadsInfo() { return reads; }
|
public Reads getReadsInfo() { return reads; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
* Returns readers used by this data source.
|
||||||
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
|
||||||
* prior to the merging too) maintained by the system.
|
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public SamFileHeaderMerger getHeaderMerger() { return resourcePool.getHeaderMerger(); }
|
public abstract Collection<SAMFileReader> getReaders();
|
||||||
|
|
||||||
|
/** Returns true if there are read group duplicates within the merged headers. */
|
||||||
|
public abstract boolean hasReadGroupCollisions();
|
||||||
|
|
||||||
|
/** Returns the read group id that should be used for the input read and RG id. */
|
||||||
|
public abstract String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -153,110 +132,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
*
|
*
|
||||||
* @return an iterator for that region
|
* @return an iterator for that region
|
||||||
*/
|
*/
|
||||||
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
|
public abstract StingSAMIterator seek(Shard shard);
|
||||||
// setup the iterator pool if it's not setup
|
|
||||||
boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? false : true;
|
|
||||||
resourcePool.setQueryOverlapping(queryOverlapping);
|
|
||||||
|
|
||||||
StingSAMIterator iterator = null;
|
|
||||||
if (shard.getShardType() == Shard.ShardType.READ) {
|
|
||||||
iterator = seekRead(shard);
|
|
||||||
iterator = applyDecoratingIterators(true,
|
|
||||||
iterator,
|
|
||||||
reads.getDownsamplingFraction(),
|
|
||||||
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
|
||||||
reads.getSupplementalFilters());
|
|
||||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS) {
|
|
||||||
iterator = seekLocus(shard);
|
|
||||||
iterator = applyDecoratingIterators(false,
|
|
||||||
iterator,
|
|
||||||
reads.getDownsamplingFraction(),
|
|
||||||
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
|
||||||
reads.getSupplementalFilters());
|
|
||||||
} else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) ||
|
|
||||||
(shard.getShardType() == Shard.ShardType.READ_INTERVAL)) {
|
|
||||||
iterator = seekLocus(shard);
|
|
||||||
iterator = applyDecoratingIterators(false,
|
|
||||||
iterator,
|
|
||||||
reads.getDownsamplingFraction(),
|
|
||||||
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
|
||||||
reads.getSupplementalFilters());
|
|
||||||
|
|
||||||
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
|
|
||||||
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL )
|
|
||||||
iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false));
|
|
||||||
mLastInterval = shard.getGenomeLoc();
|
|
||||||
} else {
|
|
||||||
|
|
||||||
throw new StingException("seek: Unknown shard type");
|
|
||||||
}
|
|
||||||
|
|
||||||
return iterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* <p>
|
|
||||||
* seekLocus
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param shard the shard containing the genome location to extract data for
|
|
||||||
*
|
|
||||||
* @return an iterator for that region
|
|
||||||
*/
|
|
||||||
private StingSAMIterator seekLocus( Shard shard ) throws SimpleDataSourceLoadException {
|
|
||||||
if(shard instanceof MonolithicShard)
|
|
||||||
return createIterator(new EntireStream());
|
|
||||||
|
|
||||||
if( getHeader().getSequenceDictionary().getSequences().size() == 0 )
|
|
||||||
throw new StingException("Unable to seek to the given locus; reads data source has no alignment information.");
|
|
||||||
return createIterator( new MappedStreamSegment(shard.getGenomeLoc()) );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* <p>
|
|
||||||
* seek
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param shard the read shard to extract from
|
|
||||||
*
|
|
||||||
* @return an iterator for that region
|
|
||||||
*/
|
|
||||||
private StingSAMIterator seekRead( Shard shard ) throws SimpleDataSourceLoadException {
|
|
||||||
if(shard instanceof MonolithicShard)
|
|
||||||
return createIterator(new EntireStream());
|
|
||||||
|
|
||||||
ReadShard readShard = (ReadShard)shard;
|
|
||||||
StingSAMIterator iter = null;
|
|
||||||
|
|
||||||
// If there are no entries in the sequence dictionary, there can't possibly be any unmapped reads. Force state to 'unmapped'.
|
|
||||||
if( isSequenceDictionaryEmpty() )
|
|
||||||
intoUnmappedReads = true;
|
|
||||||
|
|
||||||
if (!intoUnmappedReads) {
|
|
||||||
if (lastReadPos == null) {
|
|
||||||
lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
|
|
||||||
iter = createIterator(new MappedStreamSegment(lastReadPos));
|
|
||||||
return InitialReadIterator(readShard.getSize(), iter);
|
|
||||||
} else {
|
|
||||||
lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
|
|
||||||
iter = fastMappedReadSeek(readShard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos))));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (intoUnmappedReads && !includeUnmappedReads)
|
|
||||||
readShard.signalDone();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (intoUnmappedReads && includeUnmappedReads) {
|
|
||||||
if (iter != null)
|
|
||||||
iter.close();
|
|
||||||
iter = toUnmappedReads(readShard.getSize());
|
|
||||||
if (!iter.hasNext())
|
|
||||||
readShard.signalDone();
|
|
||||||
}
|
|
||||||
|
|
||||||
return iter;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If we're in by-read mode, this indicates if we want
|
* If we're in by-read mode, this indicates if we want
|
||||||
|
|
@ -270,160 +146,6 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
includeUnmappedReads = seeUnMappedReads;
|
includeUnmappedReads = seeUnMappedReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* For unit testing, add a custom iterator pool.
|
|
||||||
*
|
|
||||||
* @param resourcePool Custom mock iterator pool.
|
|
||||||
*/
|
|
||||||
void setResourcePool( SAMResourcePool resourcePool ) {
|
|
||||||
this.resourcePool = resourcePool;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieve unmapped reads.
|
|
||||||
*
|
|
||||||
* @param readCount how many reads to retrieve
|
|
||||||
*
|
|
||||||
* @return the bounded iterator that you can use to get the intervaled reads from
|
|
||||||
*/
|
|
||||||
StingSAMIterator toUnmappedReads( long readCount ) {
|
|
||||||
StingSAMIterator iter = createIterator(new UnmappedStreamSegment(readsTaken, readCount));
|
|
||||||
readsTaken += readCount;
|
|
||||||
return iter;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A seek function for mapped reads.
|
|
||||||
*
|
|
||||||
* @param readCount how many reads to retrieve
|
|
||||||
* @param iter the iterator to use, seeked to the correct start location
|
|
||||||
*
|
|
||||||
* @return the bounded iterator that you can use to get the intervaled reads from. Will be a zero-length
|
|
||||||
* iterator if no reads are available.
|
|
||||||
* @throws SimpleDataSourceLoadException
|
|
||||||
*/
|
|
||||||
StingSAMIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
|
|
||||||
BoundedReadIterator bound;
|
|
||||||
correctForReadPileupSeek(iter);
|
|
||||||
if (readsTaken == 0) {
|
|
||||||
return InitialReadIterator(readCount, iter);
|
|
||||||
}
|
|
||||||
int x = 0;
|
|
||||||
SAMRecord rec = null;
|
|
||||||
|
|
||||||
// Assuming that lastReadPos should never be null, because this is a mappedReadSeek
|
|
||||||
// and initial queries are handled by the previous conditional.
|
|
||||||
int lastContig = lastReadPos.getContigIndex();
|
|
||||||
int lastPos = (int)lastReadPos.getStart();
|
|
||||||
|
|
||||||
while (x < readsTaken) {
|
|
||||||
if (iter.hasNext()) {
|
|
||||||
rec = iter.next();
|
|
||||||
if (lastContig == rec.getReferenceIndex() && lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
|
|
||||||
else this.readsSeenAtLastPos = 1;
|
|
||||||
lastPos = rec.getAlignmentStart();
|
|
||||||
++x;
|
|
||||||
} else {
|
|
||||||
iter.close();
|
|
||||||
|
|
||||||
// jump contigs
|
|
||||||
lastReadPos = GenomeLocParser.toNextContig(lastReadPos);
|
|
||||||
if (lastReadPos == null) {
|
|
||||||
// check to see if we're using unmapped reads, if not return, we're done
|
|
||||||
readsTaken = 0;
|
|
||||||
intoUnmappedReads = true;
|
|
||||||
|
|
||||||
// fastMappedReadSeek must return an iterator, even if that iterator iterates through nothing.
|
|
||||||
return new NullSAMIterator(reads);
|
|
||||||
} else {
|
|
||||||
readsTaken = readCount;
|
|
||||||
readsSeenAtLastPos = 0;
|
|
||||||
lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
|
|
||||||
CloseableIterator<SAMRecord> ret = createIterator(new MappedStreamSegment(lastReadPos));
|
|
||||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we're off the end of the last contig (into unmapped territory)
|
|
||||||
if (rec != null && rec.getAlignmentStart() == 0) {
|
|
||||||
readsTaken += readCount;
|
|
||||||
intoUnmappedReads = true;
|
|
||||||
}
|
|
||||||
// else we're not off the end, store our location
|
|
||||||
else if (rec != null) {
|
|
||||||
int stopPos = rec.getAlignmentStart();
|
|
||||||
if (stopPos < lastReadPos.getStart()) {
|
|
||||||
lastReadPos = GenomeLocParser.createGenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
|
||||||
} else {
|
|
||||||
lastReadPos = GenomeLocParser.setStart(lastReadPos,rec.getAlignmentStart());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// in case we're run out of reads, get out
|
|
||||||
else {
|
|
||||||
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
|
||||||
//return null;
|
|
||||||
}
|
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
|
||||||
|
|
||||||
|
|
||||||
// return the iterator
|
|
||||||
return bound;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines whether the BAM file is completely unsequenced. Requires that the resource pool be initialized.
|
|
||||||
* @return True if the sequence dictionary is completely empty. False otherwise.
|
|
||||||
*/
|
|
||||||
private boolean isSequenceDictionaryEmpty() {
|
|
||||||
return getHeader().getSequenceDictionary().isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Even though the iterator has seeked to the correct location, there may be multiple reads at that location,
|
|
||||||
* and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable
|
|
||||||
*
|
|
||||||
* @param iter the iterator
|
|
||||||
*/
|
|
||||||
private void correctForReadPileupSeek( StingSAMIterator iter ) {
|
|
||||||
// move the number of reads we read from the last pos
|
|
||||||
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
|
||||||
for(int i = 0; i < this.readsSeenAtLastPos && iter.hasNext(); i++,iter.next())
|
|
||||||
atLeastOneReadSeen = true;
|
|
||||||
if (readsSeenAtLastPos > 0 && !atLeastOneReadSeen) {
|
|
||||||
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* set the initial iterator
|
|
||||||
*
|
|
||||||
* @param readCount the number of reads
|
|
||||||
* @param iter the merging iterator
|
|
||||||
*
|
|
||||||
* @return a bounded read iterator at the first read position in the file.
|
|
||||||
*/
|
|
||||||
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
|
|
||||||
BoundedReadIterator bound;
|
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
|
||||||
this.readsTaken = readCount;
|
|
||||||
return bound;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an iterator over the selected segment, from a resource pulled from the pool.
|
|
||||||
* @param segment Segment over which to gather reads.
|
|
||||||
* @return An iterator over just the reads in the given segment.
|
|
||||||
*/
|
|
||||||
private StingSAMIterator createIterator( DataStreamSegment segment ) {
|
|
||||||
StingSAMIterator iterator = resourcePool.iterator(segment);
|
|
||||||
StingSAMIterator malformedWrappedIterator = new MalformedSAMFilteringIterator( getHeader(), iterator, violations );
|
|
||||||
StingSAMIterator readWrappingIterator = new ReadWrappingIterator(malformedWrappedIterator);
|
|
||||||
return readWrappingIterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter reads based on user-specified criteria.
|
* Filter reads based on user-specified criteria.
|
||||||
*
|
*
|
||||||
|
|
@ -434,11 +156,11 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* @param supplementalFilters additional filters to apply to the reads.
|
* @param supplementalFilters additional filters to apply to the reads.
|
||||||
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
|
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
|
||||||
*/
|
*/
|
||||||
private StingSAMIterator applyDecoratingIterators(boolean enableVerification,
|
protected StingSAMIterator applyDecoratingIterators(boolean enableVerification,
|
||||||
StingSAMIterator wrappedIterator,
|
StingSAMIterator wrappedIterator,
|
||||||
Double downsamplingFraction,
|
Double downsamplingFraction,
|
||||||
Boolean noValidationOfReadOrder,
|
Boolean noValidationOfReadOrder,
|
||||||
Collection<SamRecordFilter> supplementalFilters) {
|
Collection<SamRecordFilter> supplementalFilters) {
|
||||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||||
// as there is no reason to sort something that we will end of throwing away
|
// as there is no reason to sort something that we will end of throwing away
|
||||||
if (downsamplingFraction != null)
|
if (downsamplingFraction != null)
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource
|
||||||
import org.broadinstitute.sting.gatk.traversals.*;
|
import org.broadinstitute.sting.gatk.traversals.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
||||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
|
||||||
|
|
@ -71,12 +71,7 @@ public class TraverseReads extends TraversalEngine {
|
||||||
ShardDataProvider dataProvider,
|
ShardDataProvider dataProvider,
|
||||||
T sum) {
|
T sum) {
|
||||||
|
|
||||||
if (shard instanceof ReadShard) {
|
logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", shard));
|
||||||
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
|
|
||||||
} else if (shard instanceof IntervalShard) {
|
|
||||||
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((IntervalShard) shard).getGenomeLoc()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (!(walker instanceof ReadWalker))
|
if (!(walker instanceof ReadWalker))
|
||||||
throw new IllegalArgumentException("Walker isn't a read walker!");
|
throw new IllegalArgumentException("Walker isn't a read walker!");
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ public class ShardStrategyFactoryTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReadNonInterval() {
|
public void testReadNonInterval() {
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100);
|
||||||
assertTrue(st instanceof ReadShardStrategy);
|
assertTrue(st instanceof ReadShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -57,19 +57,19 @@ public class ShardStrategyFactoryTest extends BaseTest {
|
||||||
public void testReadInterval() {
|
public void testReadInterval() {
|
||||||
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
||||||
set.add(l);
|
set.add(l);
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set);
|
||||||
assertTrue(st instanceof IntervalShardStrategy);
|
assertTrue(st instanceof IntervalShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLinearNonInterval() {
|
public void testLinearNonInterval() {
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100);
|
||||||
assertTrue(st instanceof LinearLocusShardStrategy);
|
assertTrue(st instanceof LinearLocusShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExpNonInterval() {
|
public void testExpNonInterval() {
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100);
|
||||||
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -77,7 +77,7 @@ public class ShardStrategyFactoryTest extends BaseTest {
|
||||||
public void testExpInterval() {
|
public void testExpInterval() {
|
||||||
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
||||||
set.add(l);
|
set.add(l);
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set);
|
||||||
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -85,7 +85,7 @@ public class ShardStrategyFactoryTest extends BaseTest {
|
||||||
public void testLinearInterval() {
|
public void testLinearInterval() {
|
||||||
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
GenomeLoc l = GenomeLocParser.createGenomeLoc(0,1,100);
|
||||||
set.add(l);
|
set.add(l);
|
||||||
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set);
|
ShardStrategy st = ShardStrategyFactory.shatter(null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set);
|
||||||
assertTrue(st instanceof LinearLocusShardStrategy);
|
assertTrue(st instanceof LinearLocusShardStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -81,16 +81,17 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testLinearBreakIterateAll() {
|
public void testLinearBreakIterateAll() {
|
||||||
logger.warn("Executing testLinearBreakIterateAll");
|
logger.warn("Executing testLinearBreakIterateAll");
|
||||||
// the sharding strat.
|
|
||||||
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
// setup the data
|
// setup the data
|
||||||
fl.add(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"));
|
fl.add(new File(validationDataLocation + "/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"));
|
||||||
Reads reads = new Reads(fl);
|
Reads reads = new Reads(fl);
|
||||||
|
|
||||||
|
// the sharding strat.
|
||||||
|
SAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
@ -124,13 +125,14 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testMergingTwoBAMFiles() {
|
public void testMergingTwoBAMFiles() {
|
||||||
logger.warn("Executing testMergingTwoBAMFiles");
|
logger.warn("Executing testMergingTwoBAMFiles");
|
||||||
// the sharding strat.
|
|
||||||
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
|
||||||
|
|
||||||
|
|
||||||
// setup the test files
|
// setup the test files
|
||||||
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
|
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
|
||||||
Reads reads = new Reads(fl);
|
Reads reads = new Reads(fl);
|
||||||
|
|
||||||
|
// the sharding strat.
|
||||||
|
SAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
|
||||||
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
|
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
|
||||||
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
|
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
|
||||||
|
|
@ -140,7 +142,6 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
@ -173,11 +174,11 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
|
|
||||||
count = 0;
|
count = 0;
|
||||||
// the sharding strat.
|
// the sharding strat.
|
||||||
strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
|
||||||
logger.debug("Pile two:");
|
logger.debug("Pile two:");
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
|
||||||
|
|
@ -101,11 +101,11 @@ public class SAMByIntervalTest extends BaseTest {
|
||||||
int unmappedReadsSeen = 0;
|
int unmappedReadsSeen = 0;
|
||||||
int iterations = 0;
|
int iterations = 0;
|
||||||
|
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
data.setResourcePool(gen);
|
data.setResourcePool(gen);
|
||||||
GenomeLocSortedSet set = new GenomeLocSortedSet();
|
GenomeLocSortedSet set = new GenomeLocSortedSet();
|
||||||
set.add(GenomeLocParser.createGenomeLoc(0, start, stop));
|
set.add(GenomeLocParser.createGenomeLoc(0, start, stop));
|
||||||
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, gen.getHeader().getSequenceDictionary(), UNMAPPED_READ_COUNT, set);
|
ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, gen.getHeader().getSequenceDictionary(), UNMAPPED_READ_COUNT, set);
|
||||||
|
|
||||||
StingSAMIterator iter = data.seek(strat.next());
|
StingSAMIterator iter = data.seek(strat.next());
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,7 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
int unmappedReadsSeen = 0;
|
int unmappedReadsSeen = 0;
|
||||||
int iterations = 0;
|
int iterations = 0;
|
||||||
|
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
data.setResourcePool(gen);
|
data.setResourcePool(gen);
|
||||||
++iterations;
|
++iterations;
|
||||||
StingSAMIterator ret = data.toUnmappedReads(100);
|
StingSAMIterator ret = data.toUnmappedReads(100);
|
||||||
|
|
@ -109,10 +109,10 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
targetReadCount = 5;
|
targetReadCount = 5;
|
||||||
try {
|
try {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
|
||||||
data.setResourcePool(gen);
|
data.setResourcePool(gen);
|
||||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
shardStrategy = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||||
while (shardStrategy.hasNext()) {
|
while (shardStrategy.hasNext()) {
|
||||||
StingSAMIterator ret = data.seek(shardStrategy.next());
|
StingSAMIterator ret = data.seek(shardStrategy.next());
|
||||||
assertTrue(ret != null);
|
assertTrue(ret != null);
|
||||||
|
|
@ -140,11 +140,11 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
targetReadCount = 3;
|
targetReadCount = 3;
|
||||||
try {
|
try {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
|
||||||
|
|
||||||
data.setResourcePool(gen);
|
data.setResourcePool(gen);
|
||||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
shardStrategy = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||||
while (shardStrategy.hasNext()) {
|
while (shardStrategy.hasNext()) {
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
|
@ -83,22 +84,21 @@ public class BoundedReadIteratorTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testBounding() {
|
public void testBounding() {
|
||||||
logger.warn("Executing testBounding");
|
logger.warn("Executing testBounding");
|
||||||
// the sharding strat.
|
|
||||||
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
|
|
||||||
// setup the test files
|
// setup the test files
|
||||||
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
|
fl.add(new File(seqLocation + "/dirseq/analysis/cancer_exome/twoflowcell_sams/TCGA-06-0188.aligned.duplicates_marked.bam"));
|
||||||
Reads reads = new Reads(fl);
|
Reads reads = new Reads(fl);
|
||||||
|
|
||||||
|
SAMDataSource data = new IndexDrivenSAMDataSource(reads);
|
||||||
|
// the sharding strat.
|
||||||
|
ShardStrategy strat = ShardStrategyFactory.shatter(data,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
// our target read
|
// our target read
|
||||||
final long boundedReadCount = 100;
|
final long boundedReadCount = 100;
|
||||||
long shardReadCount = 0;
|
long shardReadCount = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
|
||||||
|
|
||||||
// make sure we have a shard
|
// make sure we have a shard
|
||||||
if (!strat.hasNext()) {
|
if (!strat.hasNext()) {
|
||||||
fail("Our shatter didn't give us a single piece, this is bad");
|
fail("Our shatter didn't give us a single piece, this is bad");
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
|
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
@ -115,13 +116,12 @@ public class TraverseReadsTest extends BaseTest {
|
||||||
}
|
}
|
||||||
GenomeLocParser.setupRefContigOrdering(ref);
|
GenomeLocParser.setupRefContigOrdering(ref);
|
||||||
|
|
||||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,
|
SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList));
|
||||||
|
dataSource.viewUnmappedReads(false);
|
||||||
|
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ShardStrategyFactory.SHATTER_STRATEGY.READS,
|
||||||
ref.getSequenceDictionary(),
|
ref.getSequenceDictionary(),
|
||||||
readSize);
|
readSize);
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
|
||||||
dataSource.viewUnmappedReads(false);
|
|
||||||
|
|
||||||
countReadWalker.initialize();
|
countReadWalker.initialize();
|
||||||
Object accumulator = countReadWalker.reduceInit();
|
Object accumulator = countReadWalker.reduceInit();
|
||||||
|
|
||||||
|
|
@ -162,13 +162,12 @@ public class TraverseReadsTest extends BaseTest {
|
||||||
}
|
}
|
||||||
GenomeLocParser.setupRefContigOrdering(ref);
|
GenomeLocParser.setupRefContigOrdering(ref);
|
||||||
|
|
||||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,
|
SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList));
|
||||||
|
dataSource.viewUnmappedReads(true);
|
||||||
|
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ShardStrategyFactory.SHATTER_STRATEGY.READS,
|
||||||
ref.getSequenceDictionary(),
|
ref.getSequenceDictionary(),
|
||||||
readSize);
|
readSize);
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
|
||||||
dataSource.viewUnmappedReads(true);
|
|
||||||
|
|
||||||
countReadWalker.initialize();
|
countReadWalker.initialize();
|
||||||
Object accumulator = countReadWalker.reduceInit();
|
Object accumulator = countReadWalker.reduceInit();
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue