GATK Engine: special-case "monolithic" FilePointers, and allow them to represent multiple contigs
Sometimes the GATK engine creates a single monolithic FilePointer representing all regions in all BAM files. In such cases, the monolithic FilePointer is the only FilePointer emitted by the BAMScheduler, and it's safe to allow it to contain regions and intervals from multiple contigs. This fixes support for reading unindexed BAM files (since an unindexed BAM is one case in which the engine creates a monolithic FilePointer).
This commit is contained in:
parent
a96ed385df
commit
118e974731
|
|
@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
|
||||
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
|
||||
// the only FilePointer we will create. Marking it as monolithic exempts it from the one-contig-only
|
||||
// restriction, so that it may represent regions from multiple contigs
|
||||
filePointer.setIsMonolithic(true);
|
||||
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
|
||||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
|
|
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
|
|
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
|
|
|
|||
|
|
@ -50,6 +50,14 @@ public class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*/
|
||||
private boolean isMonolithic = false;
|
||||
|
||||
public FilePointer( List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
this.isRegionUnmapped = checkUnmappedStatus();
|
||||
|
|
@ -81,7 +89,8 @@ public class FilePointer {
|
|||
}
|
||||
|
||||
private void validateLocations() {
|
||||
if ( isRegionUnmapped ) {
|
||||
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
|
||||
if ( isRegionUnmapped || isMonolithic ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -123,6 +132,29 @@ public class FilePointer {
|
|||
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*
* The flag is initialized to false and only changes through {@link #setIsMonolithic(boolean)}.
*
|
||||
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
|
||||
*/
|
||||
public boolean isMonolithic() {
|
||||
return isMonolithic;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
|
||||
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
|
||||
* FP may contain intervals from more than one contig.
|
||||
*
* The flag is read by validateLocations(), which returns early (skipping the one-contig-only
* restriction) when this FP is unmapped or monolithic. This method only stores the value; it
* performs no validation of its own.
*
|
||||
* @param isMonolithic set this FP's monolithic status to this value
|
||||
*/
|
||||
public void setIsMonolithic( boolean isMonolithic ) {
|
||||
this.isMonolithic = isMonolithic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if(!(other instanceof FilePointer))
|
||||
|
|
|
|||
Loading…
Reference in New Issue