diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0a04e0f60..c3cdc13a5 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -27,8 +27,13 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMReadGroupRecord; + import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; @@ -40,6 +45,7 @@ import org.broadinstitute.sting.utils.cmdLine.ArgumentException; import java.io.File; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Set; @@ -51,6 +57,7 @@ public class GenomeAnalysisEngine { // our traversal engine private TraversalEngine engine = null; + private SAMDataSource dataSource = null; // our argument collection private GATKArgumentCollection argCollection; @@ -189,6 +196,8 @@ public class GenomeAnalysisEngine { Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName)); } + dataSource = microScheduler.getSAMDataSource(); + return microScheduler; } @@ -212,6 +221,92 @@ public class GenomeAnalysisEngine { return locs; } + /** + * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names + * found in the corresponding bam file. + * @return + */ + public List< Set > getSamplesByReaders() { + + + SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + + List< Set > sample_sets = new ArrayList>(hm.getReaders().size()); + + for ( SAMFileReader r : hm.getReaders() ) { + + Set samples = new HashSet(1); + sample_sets.add(samples); + + for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) { + samples.add(g.getSample()); + } + } + + return sample_sets; + + } + + /** + * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names + * found in the corresponding bam file. + * @return + */ + public List< Set > getLibrariesByReaders() { + + + SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + + List< Set > lib_sets = new ArrayList>(hm.getReaders().size()); + + for ( SAMFileReader r : hm.getReaders() ) { + + Set libs = new HashSet(2); + lib_sets.add(libs); + + for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) { + libs.add(g.getLibrary()); + } + } + + return lib_sets; + + } + + /** + * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying + * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list + * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups + * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. + * @return + */ + public List< Set > getMergedReadGroupsByReaders() { + + + SamFileHeaderMerger hm = getDataSource().getHeaderMerger(); + + List< Set > rg_sets = new ArrayList>(hm.getReaders().size()); + + for ( SAMFileReader r : hm.getReaders() ) { + + Set groups = new HashSet(5); + rg_sets.add(groups); + + for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) { + // use HeaderMerger to translate original read group id from the reader into the read group id in the + // merged stream, and save that remapped read group id to associate it with specific reader + groups.add( hm.getReadGroupId(r, g.getReadGroupId()) ); + } + } + + return rg_sets; + + } + /** * Bundles all the source information about the reads into a unified data structure. * @@ -319,6 +414,15 @@ public class GenomeAnalysisEngine { return this.engine; } + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * @return + */ + public SAMDataSource getDataSource() { + return this.dataSource; + } + /** * Gets the collection of GATK main application arguments for enhanced walker validation. * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java index fd2d57f4c..5df542a09 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java @@ -67,7 +67,26 @@ abstract class ReadStreamPointer { for (SAMFileReader reader : headerMerger.getReaders()) reader.close(); } + + /** + * Returns Reads data structure containing information about the reads data sources as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public Reads getReadsInfo() { + return sourceInfo; + } + /** + * Returns header merger: a class that keeps the mapping between original read groups and read groups + * of the merged stream; merger also provides access to the individual file readers (and hence headers + * too) maintained by the system. + * @return + */ + public SamFileHeaderMerger getHeaderMerger() { + return headerMerger; + } + /** * Remove an iterator from service. * @param iterator The iterator to remove from service. Must not be null. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java index e0000562e..ce91d3a99 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java @@ -72,6 +72,21 @@ class ReadStreamResource { public SAMFileHeader getHeader() { return header; } + + /** + * Returns Reads data structure containing information about the reads data sources as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public Reads getReadsInfo() { return readStreamPointer.getReadsInfo(); } + + /** + * Returns header merger: a class that keeps the mapping between original read groups and read groups + * of the merged stream; merger also provides access to the individual file readers (and hence headers + * too) maintained by the system. + * @return + */ + public SamFileHeaderMerger getHeaderMerger() { return readStreamPointer.getHeaderMerger(); } public boolean canAccessSegmentEfficiently(DataStreamSegment segment) { return readStreamPointer.canAccessSegmentEfficiently(segment); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 2b90c0be9..3341d573b 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.sam.SamFileHeaderMerger; + import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; @@ -116,6 +118,22 @@ public class SAMDataSource implements SimpleDataSource { return resourcePool.getHeader(); } + + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public Reads getReadsInfo() { return reads; } + + /** + * Returns header merger: a class that keeps the mapping between original read groups and read groups + * of the merged stream; merger also provides access to the individual file readers (and hence headers + * prior to the merging too) maintained by the system. + * @return + */ + public SamFileHeaderMerger getHeaderMerger() { return resourcePool.getHeaderMerger(); } + /** * * @param shard the shard to get data for diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java index dc213b23d..b2ac4ea71 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java @@ -29,6 +29,8 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.StingException; import org.apache.log4j.Logger; + +import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; @@ -44,6 +46,7 @@ import java.util.List; class SAMResourcePool extends ResourcePool { /** Source information about the reads. */ protected Reads reads; + protected SamFileHeaderMerger headerMerger; /** Is this a by-reads traversal or a by-locus? */ protected boolean queryOverlapping; @@ -60,6 +63,7 @@ class SAMResourcePool extends ResourcePool ReadStreamResource streamResource = createNewResource(); this.header = streamResource.getHeader(); + this.headerMerger = streamResource.getHeaderMerger(); // Add this resource to the pool. this.addNewResource(streamResource); } @@ -69,6 +73,21 @@ class SAMResourcePool extends ResourcePool return header; } + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public Reads getReadsInfo() { return reads; } + + /** + * Returns header merger: a class that keeps the mapping between original read groups and read groups + * of the merged stream; merger also provides access to the individual file readers (and hence headers + * too) maintained by the system. + * @return + */ + public SamFileHeaderMerger getHeaderMerger() { return headerMerger; } + protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List resources ) { for (ReadStreamResource resource : resources) { if (resource.canAccessSegmentEfficiently(segment)) { diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c2ac3c528..ae948a261 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -114,7 +114,7 @@ public abstract class MicroScheduler { } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } - this.reads = getReadsDataSource(reads); + this.reads = setupReadsDataSource(reads); this.reference = openReferenceSequenceFile(refFile); this.rods = getReferenceOrderedDataSources(rods); @@ -235,7 +235,7 @@ public abstract class MicroScheduler { * * @return A data source for the given set of reads. */ - private SAMDataSource getReadsDataSource(Reads reads) { + private SAMDataSource setupReadsDataSource(Reads reads) { // By reference traversals are happy with no reads. Make sure that case is handled. if (reads.getReadsFiles().size() == 0) return null; @@ -249,6 +249,12 @@ public abstract class MicroScheduler { return dataSource; } + + /** + * Returns data source maintained by this scheduler + * @return + */ + public SAMDataSource getSAMDataSource() { return reads; } /** * Open the reference-ordered data sources. diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 610dd3169..f9e18ee8b 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.shards.Shard; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; @@ -53,6 +54,7 @@ public abstract class TraversalEngine { this.myHeader = myHeader; } + /** * @param curTime (current runtime, in millisecs) *