SAMDataSource is now exposed by the GATK engine, and SamFileHeaderMerger is exposed from Resources all the way up to SAMDataSource, so the underlying individual readers are now accessible should we need them. The GATK engine gains three new methods — getSamplesByReaders(), getLibrariesByReaders(), and getMergedReadGroupsByReaders() — each returning a list of sets, where each element (set) holds, respectively, the samples, libraries, or (merged) read groups coming from one individual input bam file. Thus, when multiple -I options are used, we can still find out which of the input bams each read comes from.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1315 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-07-24 22:59:49 +00:00
parent 2024fb3e32
commit a361e7b342
7 changed files with 185 additions and 2 deletions

View File

@ -27,8 +27,13 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
@ -40,6 +45,7 @@ import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -51,6 +57,7 @@ public class GenomeAnalysisEngine {
// our traversal engine
private TraversalEngine engine = null;
private SAMDataSource dataSource = null;
// our argument collection
private GATKArgumentCollection argCollection;
@ -189,6 +196,8 @@ public class GenomeAnalysisEngine {
Utils.scareUser(String.format("Unable to create the appropriate TraversalEngine for analysis type " + argCollection.analysisName));
}
dataSource = microScheduler.getSAMDataSource();
return microScheduler;
}
@ -212,6 +221,92 @@ public class GenomeAnalysisEngine {
return locs;
}
/**
 * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names
 * found in the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         sample names declared by the read groups of that file's header
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getSamplesByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > sample_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of sample names per underlying reader, kept in reader order
        Set<String> samples = new HashSet<String>(1);
        sample_sets.add(samples);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            samples.add(g.getSample());
        }
    }
    return sample_sets;
}
/**
 * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names
 * found in the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         library names declared by the read groups of that file's header
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getLibrariesByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > lib_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of library names per underlying reader, kept in reader order
        Set<String> libs = new HashSet<String>(2);
        lib_sets.add(libs);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            libs.add(g.getLibrary());
        }
    }
    return lib_sets;
}
/**
 * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying
 * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list
 * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups
 * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file.
 * @return list with one element per input bam file, in reader order; each element is the set of
 *         read group ids, after remapping by the header merger, originating from that file
 * @throws IllegalStateException if no reads data source is available (e.g. the engine was run without -I input)
 */
public List< Set<String> > getMergedReadGroupsByReaders() {
    SAMDataSource ds = getDataSource();
    if ( ds == null )
        throw new IllegalStateException("No reads data source is available; was GATK run without input bam files?");
    SamFileHeaderMerger hm = ds.getHeaderMerger();
    List< Set<String> > rg_sets = new ArrayList<Set<String>>(hm.getReaders().size());
    for ( SAMFileReader r : hm.getReaders() ) {
        // one set of remapped read group ids per underlying reader, kept in reader order
        Set<String> groups = new HashSet<String>(5);
        rg_sets.add(groups);
        for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
            // use HeaderMerger to translate original read group id from the reader into the read group id in the
            // merged stream, and save that remapped read group id to associate it with specific reader
            groups.add( hm.getReadGroupId(r, g.getReadGroupId()) );
        }
    }
    return rg_sets;
}
/**
* Bundles all the source information about the reads into a unified data structure.
*
@ -319,6 +414,15 @@ public class GenomeAnalysisEngine {
return this.engine;
}
/**
 * Provides access to the engine's reads data source. The returned object exposes the header
 * merger and, through it, the underlying per-file readers, along with the other handlers used
 * to traverse reads.
 *
 * @return the SAM data source backing read traversal; may be null if none has been set up yet
 */
public SAMDataSource getDataSource() { return this.dataSource; }
/**
* Gets the collection of GATK main application arguments for enhanced walker validation.
*

View File

@ -67,7 +67,26 @@ abstract class ReadStreamPointer {
for (SAMFileReader reader : headerMerger.getReaders())
reader.close();
}
/**
 * Provides the Reads descriptor for this stream pointer: the reads data sources together
 * with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this pointer
 */
public Reads getReadsInfo() { return sourceInfo; }
/**
 * Provides the header merger for this stream pointer. The merger maintains the mapping between
 * original read groups and the read groups of the merged stream, and also gives access to the
 * individual file readers (and therefore their headers) maintained by the system.
 *
 * @return the SamFileHeaderMerger held by this pointer
 */
public SamFileHeaderMerger getHeaderMerger() { return headerMerger; }
/**
* Remove an iterator from service.
* @param iterator The iterator to remove from service. Must not be null.

View File

@ -72,6 +72,21 @@ class ReadStreamResource {
    /**
     * Returns the SAM file header associated with this read stream resource.
     * NOTE(review): presumably the merged header across all input files -- confirm against ReadStreamPointer.
     * @return the cached SAMFileHeader for this resource
     */
    public SAMFileHeader getHeader() {
        return header;
    }
/**
 * Delegates to the underlying read stream pointer to obtain the Reads descriptor: the reads
 * data sources together with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object from the backing stream pointer
 */
public Reads getReadsInfo() {
    return readStreamPointer.getReadsInfo();
}
/**
 * Delegates to the underlying read stream pointer to obtain the header merger, which tracks
 * the mapping between original read groups and merged-stream read groups and exposes the
 * individual file readers (and therefore their headers).
 *
 * @return the SamFileHeaderMerger from the backing stream pointer
 */
public SamFileHeaderMerger getHeaderMerger() {
    return readStreamPointer.getHeaderMerger();
}
public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
return readStreamPointer.canAccessSegmentEfficiently(segment);

View File

@ -5,6 +5,8 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.filter.FilteringIterator;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.picard.sam.SamFileHeaderMerger;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@ -116,6 +118,22 @@ public class SAMDataSource implements SimpleDataSource {
return resourcePool.getHeader();
}
/**
 * Provides the Reads descriptor for this data source: the reads data sources placed in the
 * pool together with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this data source
 */
public Reads getReadsInfo() {
    return reads;
}
/**
 * Delegates to the resource pool to obtain the header merger, which tracks the mapping between
 * original read groups and merged-stream read groups and exposes the individual file readers
 * (and therefore the pre-merge headers).
 *
 * @return the SamFileHeaderMerger from the backing resource pool
 */
public SamFileHeaderMerger getHeaderMerger() {
    return resourcePool.getHeaderMerger();
}
/**
*
* @param shard the shard to get data for

View File

@ -29,6 +29,8 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
import org.apache.log4j.Logger;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
@ -44,6 +46,7 @@ import java.util.List;
class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator> {
/** Source information about the reads. */
protected Reads reads;
protected SamFileHeaderMerger headerMerger;
/** Is this a by-reads traversal or a by-locus? */
protected boolean queryOverlapping;
@ -60,6 +63,7 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
ReadStreamResource streamResource = createNewResource();
this.header = streamResource.getHeader();
this.headerMerger = streamResource.getHeaderMerger();
// Add this resource to the pool.
this.addNewResource(streamResource);
}
@ -69,6 +73,21 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
return header;
}
/**
 * Provides the Reads descriptor for this pool: the reads data sources placed in it together
 * with their downsampling, sorting, and filtering settings.
 *
 * @return the Reads source-information object held by this pool
 */
public Reads getReadsInfo() {
    return reads;
}
/**
 * Provides the header merger cached by this pool. The merger maintains the mapping between
 * original read groups and the read groups of the merged stream, and also gives access to the
 * individual file readers (and therefore their headers) maintained by the system.
 *
 * @return the SamFileHeaderMerger captured when the first stream resource was created
 */
public SamFileHeaderMerger getHeaderMerger() {
    return headerMerger;
}
protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List<ReadStreamResource> resources ) {
for (ReadStreamResource resource : resources) {
if (resource.canAccessSegmentEfficiently(segment)) {

View File

@ -114,7 +114,7 @@ public abstract class MicroScheduler {
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
this.reads = getReadsDataSource(reads);
this.reads = setupReadsDataSource(reads);
this.reference = openReferenceSequenceFile(refFile);
this.rods = getReferenceOrderedDataSources(rods);
@ -235,7 +235,7 @@ public abstract class MicroScheduler {
*
* @return A data source for the given set of reads.
*/
private SAMDataSource getReadsDataSource(Reads reads) {
private SAMDataSource setupReadsDataSource(Reads reads) {
// By reference traversals are happy with no reads. Make sure that case is handled.
if (reads.getReadsFiles().size() == 0)
return null;
@ -249,6 +249,12 @@ public abstract class MicroScheduler {
return dataSource;
}
/**
 * Provides the reads data source maintained by this scheduler.
 *
 * @return the SAMDataSource created during scheduler setup; may be null for
 *         by-reference traversals that were started with no read files
 */
public SAMDataSource getSAMDataSource() {
    return reads;
}
/**
* Open the reference-ordered data sources.

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -53,6 +54,7 @@ public abstract class TraversalEngine {
this.myHeader = myHeader;
}
/**
* @param curTime (current runtime, in millisecs)
*