From 4fcee248f9faa0951cda531bc582d4d59322a7d6 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 31 Mar 2010 20:46:44 +0000 Subject: [PATCH] For Kristian: functions which, given a read, can uniquely identify the BAM file storing that read. Introducing this into the pile of code which peeks under the covers of the SAMDataSource in the hopes that this function can help to replace the others and provide a single path for crosstalk. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3103 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 18 +++++++++ .../BlockDrivenSAMDataSource.java | 37 +++++++++++++++++++ .../IndexDrivenSAMDataSource.java | 11 ++++++ .../simpleDataSources/ReadStreamResource.java | 11 ++++++ .../simpleDataSources/SAMDataSource.java | 10 ++++- .../simpleDataSources/SAMResourcePool.java | 2 + 6 files changed, 88 insertions(+), 1 deletion(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f5a6cd29f..7496f18fa 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -371,6 +371,24 @@ public class GenomeAnalysisEngine { return locs; } + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. + */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getDataSource().getSAMFile(id); + } + /** * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. 
underlying * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index abe10b0c5..1a712cbc8 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -175,6 +175,15 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { iterator.close(); } + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader. + */ + public SAMReaderID getReaderID(SAMRecord read) { + return resourcePool.getReaderID(read.getReader()); + } + /** * Adds this read to the given shard. * @param shard The shard to which to add the read. @@ -357,6 +366,20 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { availableResources.add(readers); } + /** + * Gets the reader id for the given reader. + * @param reader Reader for which to determine the id. + * @return id of the given reader. + */ + protected synchronized SAMReaderID getReaderID(SAMFileReader reader) { + for(SAMReaders readers: allResources) { + SAMReaderID id = readers.getReaderID(reader); + if(id != null) + return id; + } + throw new StingException("No such reader id is available"); + } + private synchronized void createNewResource() { if(allResources.size() > maxEntries) throw new StingException("Cannot create a new resource pool. All resources are in use."); @@ -413,6 +436,20 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return readers.get(id); } + /** + * Searches for the reader id of this reader. + * @param reader Reader for which to search. 
+ * @return The id associated with the given reader, or null if the reader is not present in this collection. + */ + protected SAMReaderID getReaderID(SAMFileReader reader) { + for(Map.Entry entry: readers.entrySet()) { + if(reader == entry.getValue()) + return entry.getKey(); + } + // Not found? return null. + return null; + } + /** * Returns an iterator over all readers in this structure. * @return An iterator over readers. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java index 322d83f6e..d35e9fa5e 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -120,6 +120,17 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { return resourcePool.fileToReaderMap.get(id.samFile).getFileHeader(); } + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader.
+ */ + public SAMReaderID getReaderID(SAMRecord read) { + if(resourcePool.readerToIDMap.containsKey(read.getReader())) + return resourcePool.readerToIDMap.get(read.getReader()); + throw new StingException("Unable to find reader id for record."); + } + /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as * information about how they are downsampled, sorted, and filtered diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java index 66a665171..38899a7c7 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java @@ -79,6 +79,11 @@ class ReadStreamResource { */ private Map fileToReaderMap = null; + /** + * A mapping from reader back to the ID uniquely identifying this input file. + */ + private Map readerToIDMap = null; + public ReadStreamResource( Reads sourceInfo ) { SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate); @@ -146,6 +151,10 @@ class ReadStreamResource { return fileToReaderMap; } + public Map getReaderToIDMapping() { + return readerToIDMap; + } + /** * A private function that, given the internal file list, generates a merging construct for * all available files. 
@@ -160,9 +169,11 @@ class ReadStreamResource { // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); fileToReaderMap = new HashMap(); + readerToIDMap = new HashMap(); for (File f : reads.getReadsFiles()) { SAMFileReader reader = new SAMFileReader(f, eagerDecode); fileToReaderMap.put(f, reader); + readerToIDMap.put(reader,new SAMReaderID(f)); reader.setValidationStringency(reads.getValidationStringency()); final SAMFileHeader header = reader.getFileHeader(); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index f60e52a03..32775dbc6 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; @@ -122,6 +123,13 @@ public abstract class SAMDataSource implements SimpleDataSource { */ public abstract SAMFileHeader getHeader(SAMReaderID reader); + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader. + */ + public abstract SAMReaderID getReaderID(SAMRecord read); + /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as * information about how they are downsampled, sorted, and filtered @@ -143,7 +151,7 @@ public abstract class SAMDataSource implements SimpleDataSource { */ public File getSAMFile(SAMReaderID id) { return id.samFile; - } + } /** Returns true if there are read group duplicates within the merged headers. 
*/ public abstract boolean hasReadGroupCollisions(); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java index 5d6e73775..9a1504ecb 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java @@ -54,6 +54,7 @@ class SAMResourcePool extends ResourcePool protected Reads reads; protected SamFileHeaderMerger headerMerger; protected Map fileToReaderMap; + protected Map readerToIDMap; /** * Do all the constituent BAM files have indices? We support some very limited @@ -76,6 +77,7 @@ class SAMResourcePool extends ResourcePool this.headerMerger = streamResource.getHeaderMerger(); this.hasIndex = streamResource.hasIndex(); this.fileToReaderMap = streamResource.getFileToReaderMapping(); + this.readerToIDMap = streamResource.getReaderToIDMapping(); // Add this resource to the pool. this.addNewResource(streamResource);