SAMDataSource is now exposed by the GATK engine. SamFileHeaderMerger is exposed from Resources all the way up to SAMDataSource, so we can now see the underlying individual readers should we need them. The GATK engine has new methods getSamplesByReaders(), getLibrariesByReaders(), and getMergedReadGroupsByReaders(): each of these methods returns a list of sets, with each element (set) holding, respectively, the samples, libraries, or (merged) read groups coming from an individual input bam file. (So now, when using multiple -I options, we can still find out which of the input bams each read comes from.)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1315 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2024fb3e32
commit
a361e7b342
|
|
@ -27,8 +27,13 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
|
|
@ -40,6 +45,7 @@ import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
|||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
|
@ -51,6 +57,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// our traversal engine
|
||||
private TraversalEngine engine = null;
|
||||
private SAMDataSource dataSource = null;
|
||||
|
||||
// our argument collection
|
||||
private GATKArgumentCollection argCollection;
|
||||
|
|
@ -189,6 +196,8 @@ public class GenomeAnalysisEngine {
|
|||
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
|
||||
}
|
||||
|
||||
dataSource = microScheduler.getSAMDataSource();
|
||||
|
||||
return microScheduler;
|
||||
}
|
||||
|
||||
|
|
@ -212,6 +221,92 @@ public class GenomeAnalysisEngine {
|
|||
return locs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
|
||||
* found in the corresponding bam file.
|
||||
* @return
|
||||
*/
|
||||
public List< Set<String> > getSamplesByReaders() {
|
||||
|
||||
|
||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
||||
|
||||
List< Set<String> > sample_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
||||
|
||||
for ( SAMFileReader r : hm.getReaders() ) {
|
||||
|
||||
Set<String> samples = new HashSet<String>(1);
|
||||
sample_sets.add(samples);
|
||||
|
||||
for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
|
||||
samples.add(g.getSample());
|
||||
}
|
||||
}
|
||||
|
||||
return sample_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
|
||||
* found in the corresponding bam file.
|
||||
* @return
|
||||
*/
|
||||
public List< Set<String> > getLibrariesByReaders() {
|
||||
|
||||
|
||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
||||
|
||||
List< Set<String> > lib_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
||||
|
||||
for ( SAMFileReader r : hm.getReaders() ) {
|
||||
|
||||
Set<String> libs = new HashSet<String>(2);
|
||||
lib_sets.add(libs);
|
||||
|
||||
for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
|
||||
libs.add(g.getLibrary());
|
||||
}
|
||||
}
|
||||
|
||||
return lib_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
|
||||
* individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
|
||||
* returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
|
||||
* (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
|
||||
* @return
|
||||
*/
|
||||
public List< Set<String> > getMergedReadGroupsByReaders() {
|
||||
|
||||
|
||||
SamFileHeaderMerger hm = getDataSource().getHeaderMerger();
|
||||
|
||||
List< Set<String> > rg_sets = new ArrayList<Set<String>>(hm.getReaders().size());
|
||||
|
||||
for ( SAMFileReader r : hm.getReaders() ) {
|
||||
|
||||
Set<String> groups = new HashSet<String>(5);
|
||||
rg_sets.add(groups);
|
||||
|
||||
for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
|
||||
// use HeaderMerger to translate original read group id from the reader into the read group id in the
|
||||
// merged stream, and save that remapped read group id to associate it with specific reader
|
||||
groups.add( hm.getReadGroupId(r, g.getReadGroupId()) );
|
||||
}
|
||||
}
|
||||
|
||||
return rg_sets;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Bundles all the source information about the reads into a unified data structure.
|
||||
*
|
||||
|
|
@ -319,6 +414,15 @@ public class GenomeAnalysisEngine {
|
|||
return this.engine;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
* @return
|
||||
*/
|
||||
public SAMDataSource getDataSource() {
|
||||
return this.dataSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the collection of GATK main application arguments for enhanced walker validation.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -67,7 +67,26 @@ abstract class ReadStreamPointer {
|
|||
for (SAMFileReader reader : headerMerger.getReaders())
|
||||
reader.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns Reads data structure containing information about the reads data sources as well as
|
||||
* information about how they are downsampled, sorted, and filtered
|
||||
* @return
|
||||
*/
|
||||
public Reads getReadsInfo() {
|
||||
return sourceInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
||||
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
||||
* too) maintained by the system.
|
||||
* @return
|
||||
*/
|
||||
public SamFileHeaderMerger getHeaderMerger() {
|
||||
return headerMerger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove an iterator from service.
|
||||
* @param iterator The iterator to remove from service. Must not be null.
|
||||
|
|
|
|||
|
|
@ -72,6 +72,21 @@ class ReadStreamResource {
|
|||
public SAMFileHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns Reads data structure containing information about the reads data sources as well as
|
||||
* information about how they are downsampled, sorted, and filtered
|
||||
* @return
|
||||
*/
|
||||
public Reads getReadsInfo() { return readStreamPointer.getReadsInfo(); }
|
||||
|
||||
/**
|
||||
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
||||
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
||||
* too) maintained by the system.
|
||||
* @return
|
||||
*/
|
||||
public SamFileHeaderMerger getHeaderMerger() { return readStreamPointer.getHeaderMerger(); }
|
||||
|
||||
public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
|
||||
return readStreamPointer.canAccessSegmentEfficiently(segment);
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.filter.FilteringIterator;
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
|
|
@ -116,6 +118,22 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
return resourcePool.getHeader();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns Reads data structure containing information about the reads data sources placed in this pool as well as
|
||||
* information about how they are downsampled, sorted, and filtered
|
||||
* @return
|
||||
*/
|
||||
public Reads getReadsInfo() { return reads; }
|
||||
|
||||
/**
|
||||
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
||||
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
||||
* prior to the merging too) maintained by the system.
|
||||
* @return
|
||||
*/
|
||||
public SamFileHeaderMerger getHeaderMerger() { return resourcePool.getHeaderMerger(); }
|
||||
|
||||
/**
|
||||
*
|
||||
* @param shard the shard to get data for
|
||||
|
|
|
|||
|
|
@ -29,6 +29,8 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -44,6 +46,7 @@ import java.util.List;
|
|||
class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator> {
|
||||
/** Source information about the reads. */
|
||||
protected Reads reads;
|
||||
protected SamFileHeaderMerger headerMerger;
|
||||
|
||||
/** Is this a by-reads traversal or a by-locus? */
|
||||
protected boolean queryOverlapping;
|
||||
|
|
@ -60,6 +63,7 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
|
|||
|
||||
ReadStreamResource streamResource = createNewResource();
|
||||
this.header = streamResource.getHeader();
|
||||
this.headerMerger = streamResource.getHeaderMerger();
|
||||
// Add this resource to the pool.
|
||||
this.addNewResource(streamResource);
|
||||
}
|
||||
|
|
@ -69,6 +73,21 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
|
|||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns Reads data structure containing information about the reads data sources placed in this pool as well as
|
||||
* information about how they are downsampled, sorted, and filtered
|
||||
* @return
|
||||
*/
|
||||
public Reads getReadsInfo() { return reads; }
|
||||
|
||||
/**
|
||||
* Returns header merger: a class that keeps the mapping between original read groups and read groups
|
||||
* of the merged stream; merger also provides access to the individual file readers (and hence headers
|
||||
* too) maintained by the system.
|
||||
* @return
|
||||
*/
|
||||
public SamFileHeaderMerger getHeaderMerger() { return headerMerger; }
|
||||
|
||||
protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List<ReadStreamResource> resources ) {
|
||||
for (ReadStreamResource resource : resources) {
|
||||
if (resource.canAccessSegmentEfficiently(segment)) {
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ public abstract class MicroScheduler {
|
|||
} else {
|
||||
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
||||
}
|
||||
this.reads = getReadsDataSource(reads);
|
||||
this.reads = setupReadsDataSource(reads);
|
||||
this.reference = openReferenceSequenceFile(refFile);
|
||||
this.rods = getReferenceOrderedDataSources(rods);
|
||||
|
||||
|
|
@ -235,7 +235,7 @@ public abstract class MicroScheduler {
|
|||
*
|
||||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource getReadsDataSource(Reads reads) {
|
||||
private SAMDataSource setupReadsDataSource(Reads reads) {
|
||||
// By reference traversals are happy with no reads. Make sure that case is handled.
|
||||
if (reads.getReadsFiles().size() == 0)
|
||||
return null;
|
||||
|
|
@ -249,6 +249,12 @@ public abstract class MicroScheduler {
|
|||
|
||||
return dataSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source maintained by this scheduler
|
||||
* @return
|
||||
*/
|
||||
public SAMDataSource getSAMDataSource() { return reads; }
|
||||
|
||||
/**
|
||||
* Open the reference-ordered data sources.
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
|
|
@ -53,6 +54,7 @@ public abstract class TraversalEngine {
|
|||
this.myHeader = myHeader;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param curTime (current runtime, in millisecs)
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue