SAMDataSource is now exposed by the GATK engine, and SamFileHeaderMerger is exposed from Resources all the way up to SAMDataSource, so the underlying individual readers are now accessible should we need them. The GATK engine gains three new methods — getSamplesByReaders(), getLibrariesByReaders(), and getMergedReadGroupsByReaders() — each returning a list of sets, where each element (set) holds, respectively, the samples, libraries, or (merged) read groups coming from one individual input bam file. Thus, when multiple -I options are used, we can still find out which of the input bams each read comes from.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1315 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-07-24 22:59:49 +00:00
parent 2024fb3e32
commit a361e7b342
7 changed files with 185 additions and 2 deletions

View File

@ -27,8 +27,13 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
@ -40,6 +45,7 @@ import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -51,6 +57,7 @@ public class GenomeAnalysisEngine {
// our traversal engine
private TraversalEngine engine = null;
private SAMDataSource dataSource = null;
// our argument collection
private GATKArgumentCollection argCollection;
@ -189,6 +196,8 @@ public class GenomeAnalysisEngine {
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
}
dataSource = microScheduler.getSAMDataSource();
return microScheduler;
}
@ -212,6 +221,92 @@ public class GenomeAnalysisEngine {
return locs;
}
/**
 * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
 * found in the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         sample names declared by the read groups of that file's header
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getSamplesByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > sample_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of sample names per underlying reader, kept in reader order
        Set<String> samples = new HashSet<String>(1);
        sample_sets.add(samples);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            samples.add(g.getSample());
        }
    }
    return sample_sets;
}
/**
 * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
 * found in the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         library names declared by the read groups of that file's header
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getLibrariesByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > lib_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of library names per underlying reader, kept in reader order
        Set<String> libs = new HashSet<String>(2);
        lib_sets.add(libs);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            libs.add(g.getLibrary());
        }
    }
    return lib_sets;
}
/**
 * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
 * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         read group ids, after remapping by the header merger, originating from that file
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getMergedReadGroupsByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > rg_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of remapped read group ids per underlying reader, kept in reader order
        Set<String> groups = new HashSet<String>(5);
        rg_sets.add(groups);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            // use HeaderMerger to translate original read group id from the reader into the read group id in the
            // merged stream, and save that remapped read group id to associate it with specific reader
            groups.add( hm.getReadGroupId(r, g.getReadGroupId()) );
        }
    }
    return rg_sets;
}
/**
* Bundles all the source information about the reads into a unified data structure.
*
@ -319,6 +414,15 @@ public class GenomeAnalysisEngine {
return this.engine;
}
/**
 * Provides access to the engine's reads data source. The returned object exposes the header
 * merger and, through it, the underlying per-file readers, along with the other handlers used
 * to traverse reads.
 *
 * @return the SAM data source backing read traversal; may be null if none has been set up yet
 */
public SAMDataSource getDataSource() { return this.dataSource; }
/**
* Gets the collection of GATK main application arguments for enhanced walker validation.
*

View File

@ -67,7 +67,26 @@ abstract class ReadStreamPointer {
for (SAMFileReader reader : headerMerger.getReaders())
reader.close();
}
/**
 * Provides the Reads descriptor for this stream pointer: the reads data sources together
 * with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this pointer
 */
public Reads getReadsInfo() { return sourceInfo; }
/**
 * Provides the header merger for this stream pointer. The merger maintains the mapping between
 * original read groups and the read groups of the merged stream, and also gives access to the
 * individual file readers (and therefore their headers) maintained by the system.
 *
 * @return the SamFileHeaderMerger held by this pointer
 */
public SamFileHeaderMerger getHeaderMerger() { return headerMerger; }
/**
* Remove an iterator from service.
* @param iterator The iterator to remove from service. Must not be null.

View File

@ -72,6 +72,21 @@ class ReadStreamResource {
    /**
     * Returns the SAM file header associated with this read stream resource.
     * NOTE(review): presumably the merged header across all input files -- confirm against ReadStreamPointer.
     * @return the cached SAMFileHeader for this resource
     */
    public SAMFileHeader getHeader() {
        return header;
    }
/**
 * Delegates to the underlying read stream pointer to obtain the Reads descriptor: the reads
 * data sources together with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object from the backing stream pointer
 */
public Reads getReadsInfo() {
    return readStreamPointer.getReadsInfo();
}
/**
 * Delegates to the underlying read stream pointer to obtain the header merger, which tracks
 * the mapping between original read groups and merged-stream read groups and exposes the
 * individual file readers (and therefore their headers).
 *
 * @return the SamFileHeaderMerger from the backing stream pointer
 */
public SamFileHeaderMerger getHeaderMerger() {
    return readStreamPointer.getHeaderMerger();
}
public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
return readStreamPointer.canAccessSegmentEfficiently(segment);

View File

@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.filter.FilteringIterator;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.picard.sam.SamFileHeaderMerger;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@ -116,6 +118,22 @@ public class SAMDataSource implements SimpleDataSource {
return resourcePool.getHeader();
}
/**
 * Provides the Reads descriptor for this data source: the reads data sources placed in the
 * pool together with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this data source
 */
public Reads getReadsInfo() {
    return reads;
}
/**
 * Delegates to the resource pool to obtain the header merger, which tracks the mapping between
 * original read groups and merged-stream read groups and exposes the individual file readers
 * (and therefore the pre-merge headers).
 *
 * @return the SamFileHeaderMerger from the backing resource pool
 */
public SamFileHeaderMerger getHeaderMerger() {
    return resourcePool.getHeaderMerger();
}
/**
*
* @param shard the shard to get data for

View File

@ -29,6 +29,8 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
@ -44,6 +46,7 @@ import java.util.List;
class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator> {
/** Source information about the reads. */
protected Reads reads;
protected SamFileHeaderMerger headerMerger;
/** Is this a by-reads traversal or a by-locus? */
protected boolean queryOverlapping;
@ -60,6 +63,7 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
ReadStreamResource streamResource = createNewResource();
this.header = streamResource.getHeader();
this.headerMerger = streamResource.getHeaderMerger();
// Add this resource to the pool.
this.addNewResource(streamResource);
}
@ -69,6 +73,21 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
return header;
}
/**
 * Provides the Reads descriptor for this pool: the reads data sources placed in it together
 * with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this pool
 */
public Reads getReadsInfo() {
    return reads;
}
/**
 * Provides the header merger cached by this pool. The merger maintains the mapping between
 * original read groups and the read groups of the merged stream, and also gives access to the
 * individual file readers (and therefore their headers) maintained by the system.
 *
 * @return the SamFileHeaderMerger captured when the first stream resource was created
 */
public SamFileHeaderMerger getHeaderMerger() {
    return headerMerger;
}
protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List<ReadStreamResource> resources ) {
for (ReadStreamResource resource : resources) {
if (resource.canAccessSegmentEfficiently(segment)) {

View File

@ -114,7 +114,7 @@ public abstract class MicroScheduler {
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
this.reads = getReadsDataSource(reads);
this.reads = setupReadsDataSource(reads);
this.reference = openReferenceSequenceFile(refFile);
this.rods = getReferenceOrderedDataSources(rods);
@ -235,7 +235,7 @@ public abstract class MicroScheduler {
*
* @return A data source for the given set of reads.
*/
private SAMDataSource getReadsDataSource(Reads reads) {
private SAMDataSource setupReadsDataSource(Reads reads) {
// By reference traversals are happy with no reads. Make sure that case is handled.
if (reads.getReadsFiles().size() == 0)
return null;
@ -249,6 +249,12 @@ public abstract class MicroScheduler {
return dataSource;
}
/**
 * Provides the reads data source maintained by this scheduler.
 *
 * @return the SAMDataSource created during scheduler setup; may be null for
 *         by-reference traversals that were started with no read files
 */
public SAMDataSource getSAMDataSource() {
    return reads;
}
/**
* Open the reference-ordered data sources.

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -53,6 +54,7 @@ public abstract class TraversalEngine {
this.myHeader = myHeader;
}
/**
* @param curTime (current runtime, in millisecs)
*