From 023654696e14d1d680abe493e84f48e0ced6dd08 Mon Sep 17 00:00:00 2001 From: hanna Date: Thu, 4 Mar 2010 00:59:32 +0000 Subject: [PATCH] First pass at handling SAMFileReaders using a SAMReaderID. This allows us to firewall GATK users from the readers, which they could abuse in ways that could destabilize the GATK. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2923 348d0f76-0448-11de-a6fe-93d51630548a --- java/src/net/sf/samtools/SAMFileReader2.java | 12 - .../sting/gatk/GenomeAnalysisEngine.java | 54 ++- .../BlockDrivenSAMDataSource.java | 318 ++++++++++-------- .../IndexDrivenSAMDataSource.java | 24 +- .../simpleDataSources/SAMDataSource.java | 38 ++- .../simpleDataSources/SAMReaderID.java | 46 +++ .../gatk/walkers/indels/IndelRealigner.java | 8 +- 7 files changed, 285 insertions(+), 215 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMReaderID.java diff --git a/java/src/net/sf/samtools/SAMFileReader2.java b/java/src/net/sf/samtools/SAMFileReader2.java index a9377e17d..9943e6d53 100644 --- a/java/src/net/sf/samtools/SAMFileReader2.java +++ b/java/src/net/sf/samtools/SAMFileReader2.java @@ -185,16 +185,4 @@ public class SAMFileReader2 extends SAMFileReader { throw new StingException("Unable to run method findIndexFile",ex); } } - - @Override - public boolean equals(Object other) { - if(other == null) return false; - if(!(other instanceof SAMFileReader2)) return false; - return this.sourceFile.equals(((SAMFileReader2)other).sourceFile); - } - - @Override - public int hashCode() { - return sourceFile.hashCode(); - } } diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 759c0932d..75fd0ba2d 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,10 +30,7 @@ import 
net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.*; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.*; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.shards.Shard; @@ -344,18 +341,16 @@ public class GenomeAnalysisEngine { * @return */ public List> getSamplesByReaders() { - - - Collection readers = getDataSource().getReaders(); + List readers = getDataSource().getReaderIDs(); List> sample_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : readers) { + for (SAMReaderID r : readers) { Set samples = new HashSet(1); sample_sets.add(samples); - for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) { + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { samples.add(g.getSample()); } } @@ -375,16 +370,16 @@ public class GenomeAnalysisEngine { public List> getLibrariesByReaders() { - Collection readers = getDataSource().getReaders(); + List readers = getDataSource().getReaderIDs(); List> lib_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : readers) { + for (SAMReaderID r : readers) { Set libs = new HashSet(2); lib_sets.add(libs); - for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) { + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { libs.add(g.getLibrary()); } } @@ -393,42 +388,30 @@ public class GenomeAnalysisEngine { } - /** - * Returns a mapping from original input files to the SAMFileReaders - * - * 
@return the mapping - */ - public Map getFileToReaderMapping() { - return getDataSource().getFileToReaderMapping(); - } - /** * Returns a mapping from original input files to their (merged) read group ids * * @return the mapping */ public Map> getFileToReadGroupIdMapping() { - Map fileToReaderMap = getFileToReaderMapping(); - // populate the file -> read group mapping Map> fileToReadGroupIdMap = new HashMap>(); - for (Map.Entry entry : fileToReaderMap.entrySet()) { - + for (SAMReaderID id: getDataSource().getReaderIDs()) { Set readGroups = new HashSet(5); - for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) { + for (SAMReadGroupRecord g : getDataSource().getHeader(id).getReadGroups()) { if (getDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes. // If there were, use the SamFileHeaderMerger to translate from the // original read group id to the read group id in the merged stream - readGroups.add(getDataSource().getReadGroupId(entry.getValue(), g.getReadGroupId())); + readGroups.add(getDataSource().getReadGroupId(id,g.getReadGroupId())); } else { // otherwise, pass through the unmapped read groups since this is what Picard does as well readGroups.add(g.getReadGroupId()); } } - fileToReadGroupIdMap.put(entry.getKey(), readGroups); + fileToReadGroupIdMap.put(getDataSource().getSAMFile(id),readGroups); } return fileToReadGroupIdMap; @@ -447,16 +430,16 @@ public class GenomeAnalysisEngine { public List> getMergedReadGroupsByReaders() { - Collection readers = getDataSource().getReaders(); + List readers = getDataSource().getReaderIDs(); List> rg_sets = new ArrayList>(readers.size()); - for (SAMFileReader r : readers) { + for (SAMReaderID r : readers) { Set groups = new HashSet(5); rg_sets.add(groups); - for (SAMReadGroupRecord g : r.getFileHeader().getReadGroups()) { + for (SAMReadGroupRecord g : getDataSource().getHeader(r).getReadGroups()) { if (getDataSource().hasReadGroupCollisions()) { // Check if there were 
read group clashes with hasGroupIdDuplicates and if so: // use HeaderMerger to translate original read group id from the reader into the read group id in the // merged stream, and save that remapped read group id to associate it with specific reader @@ -789,6 +772,15 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(); } + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource.getHeader(reader); + } + /** * Returns data source object encapsulating all essential info and handlers used to traverse * reads; header merger, individual file readers etc can be accessed through the returned data source object. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java index ac6e5d2fa..0761e9480 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/BlockDrivenSAMDataSource.java @@ -23,7 +23,6 @@ import java.io.File; * @version 0.1 */ public class BlockDrivenSAMDataSource extends SAMDataSource { - /** * A collection of readers driving the merging process. */ @@ -32,7 +31,17 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { /** * The merged header. */ - private final SAMFileHeader header; + private final SAMFileHeader mergedHeader; + + /** + * Whether the read groups in overlapping files collide. + */ + private final boolean hasReadGroupCollisions; + + /** + * Maps the SAM readers' original read group ids to their revised ids. + */ + private final Map mergedReadGroupMappings = new HashMap(); /** * Create a new block-aware SAM data source given the supplied read metadata. 
@@ -44,32 +53,36 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { logger.warn("Experimental sharding is enabled. Many use cases are not supported. Please use with care."); resourcePool = new SAMResourcePool(Integer.MAX_VALUE); - Collection readers = resourcePool.getAvailableReaders(); - header = new SamFileHeaderMerger(readers,SAMFileHeader.SortOrder.coordinate,true).getMergedHeader(); + SAMReaders readers = resourcePool.getAvailableReaders(); + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers.values(),SAMFileHeader.SortOrder.coordinate,true); + mergedHeader = headerMerger.getMergedHeader(); + hasReadGroupCollisions = headerMerger.hasReadGroupCollisions(); + + for(SAMReaderID id: readerIDs) { + SAMFileReader reader = readers.getReader(id); + ReadGroupMapping mapping = new ReadGroupMapping(); + + List readGroups = reader.getFileHeader().getReadGroups(); + for(SAMReadGroupRecord readGroup: readGroups) + mapping.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId())); + + mergedReadGroupMappings.put(id,mapping); + } + resourcePool.releaseReaders(readers); } - public boolean hasIndex() { - Collection readers = resourcePool.getAvailableReaders(); - try { - return hasIndex(readers); - } - finally { - resourcePool.releaseReaders(readers); - } - } - /** - * Report whether a given collection of SAM file readers is indexed. - * @param readers The collection of readers. - * @return True if the given collection of readers is indexed. + * True if all readers have an index. + * @return */ - private boolean hasIndex(Collection readers) { - for(SAMFileReader reader: readers) { + public boolean hasIndex() { + for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) { if(!reader.hasIndex()) return false; } - return true; + return true; } /** @@ -78,19 +91,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return A map of reader back to bin. 
*/ public List getOverlappingBins(final GenomeLoc location) { - Collection readers = resourcePool.getAvailableReaders(); + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + if(readers.isEmpty()) + return Collections.emptyList(); - try { - if(readers.size() == 0) - return Collections.emptyList(); - - // All readers will have the same bin structure, so just use the first bin as an example. - SAMFileReader2 reader = (SAMFileReader2)readers.iterator().next(); - return reader.getOverlappingBins(location.getContig(),(int)location.getStart(),(int)location.getStop()); - } - finally { - resourcePool.releaseReaders(readers); - } + // All readers will have the same bin structure, so just use the first bin as an example. + SAMFileReader2 reader = (SAMFileReader2)readers.iterator().next(); + return reader.getOverlappingBins(location.getContig(),(int)location.getStart(),(int)location.getStop()); } /** @@ -99,18 +106,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return A map of the file pointers bounding the bin. */ public Map> getFilePointersBounding(Bin bin) { - Collection readers = resourcePool.getAvailableReaders(); - try { - Map> filePointers = new HashMap>(); - for(SAMFileReader reader: readers) { - SAMFileReader2 reader2 = (SAMFileReader2)reader; - filePointers.put(reader2,reader2.getFilePointersBounding(bin)); - } - return filePointers; - } - finally { - resourcePool.releaseReaders(readers); + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + Map> filePointers = new HashMap>(); + for(SAMFileReader reader: readers) { + SAMFileReader2 reader2 = (SAMFileReader2)reader; + filePointers.put(reader2,reader2.getFilePointersBounding(bin)); } + return filePointers; } /** @@ -118,18 +120,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return A mapping of reader to current position. 
*/ public Map getCurrentPosition() { - Collection readers = resourcePool.getAvailableReaders(); - try { - Map currentPositions = new HashMap(); - for(SAMFileReader reader: readers) { - SAMFileReader2 reader2 = (SAMFileReader2)reader; - currentPositions.put(reader2,reader2.getCurrentPosition()); - } - return currentPositions; - } - finally { - resourcePool.releaseReaders(readers); + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + Map currentPositions = new HashMap(); + for(SAMFileReader reader: readers) { + SAMFileReader2 reader2 = (SAMFileReader2)reader; + currentPositions.put(reader2,reader2.getCurrentPosition()); } + return currentPositions; } /** @@ -137,18 +134,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return Number of levels in this index. */ public int getNumIndexLevels() { - Collection readers = resourcePool.getAvailableReaders(); - try { - if(readers.size() == 0) - throw new StingException("Unable to determine number of index levels; no BAMs are present."); - if(!hasIndex(readers)) - throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); - SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); - return firstReader.getNumIndexLevels(); - } - finally { - resourcePool.releaseReaders(readers); - } + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + if(readers.isEmpty()) + throw new StingException("Unable to determine number of index levels; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of index levels; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); + return firstReader.getNumIndexLevels(); } /** @@ -157,18 +149,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return the level associated with the given bin number. 
*/ public int getLevelForBin(final Bin bin) { - Collection readers = resourcePool.getAvailableReaders(); - try { - if(readers.size() == 0) - throw new StingException("Unable to determine number of level for bin; no BAMs are present."); - if(!hasIndex(readers)) - throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); - SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); - return firstReader.getLevelForBin(bin); - } - finally { - resourcePool.releaseReaders(readers); - } + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + if(readers.isEmpty()) + throw new StingException("Unable to determine number of level for bin; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); + return firstReader.getLevelForBin(bin); } /** @@ -177,18 +164,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return The last position that the given bin can represent. 
*/ public int getFirstLocusInBin(final Bin bin) { - Collection readers = resourcePool.getAvailableReaders(); - try { - if(readers.size() == 0) - throw new StingException("Unable to determine number of level for bin; no BAMs are present."); - if(!hasIndex(readers)) - throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); - SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); - return firstReader.getFirstLocusInBin(bin); - } - finally { - resourcePool.releaseReaders(readers); - } + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + if(readers.isEmpty()) + throw new StingException("Unable to determine number of level for bin; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); + return firstReader.getFirstLocusInBin(bin); } /** @@ -197,18 +179,13 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return The last position that the given bin can represent. 
*/ public int getLastLocusInBin(final Bin bin) { - Collection readers = resourcePool.getAvailableReaders(); - try { - if(readers.size() == 0) - throw new StingException("Unable to determine number of level for bin; no BAMs are present."); - if(!hasIndex(readers)) - throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); - SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); - return firstReader.getLastLocusInBin(bin); - } - finally { - resourcePool.releaseReaders(readers); - } + SAMReaders readers = resourcePool.getReadersWithoutLocking(); + if(readers.isEmpty()) + throw new StingException("Unable to determine number of level for bin; no BAMs are present."); + if(!hasIndex()) + throw new SAMException("Unable to determine number of level for bin; BAM file index is not present."); + SAMFileReader2 firstReader = (SAMFileReader2)readers.iterator().next(); + return firstReader.getLastLocusInBin(bin); } /** @@ -252,7 +229,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { } private StingSAMIterator getIterator(BAMFormatAwareShard shard, boolean enableVerification) { - Collection readers = resourcePool.getAvailableReaders(); + SAMReaders readers = resourcePool.getAvailableReaders(); Map> readerToIteratorMap = new HashMap>(); for(SAMFileReader reader: readers) { @@ -261,7 +238,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { readerToIteratorMap.put(reader2,reader2.iterator(chunks)); } - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers,SAMFileHeader.SortOrder.coordinate,true); + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers.values(),SAMFileHeader.SortOrder.coordinate,true); // Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set. 
CloseableIterator iterator = new MergingSamRecordIterator(headerMerger,readerToIteratorMap,true); @@ -280,15 +257,11 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return The merged header. */ public SAMFileHeader getHeader() { - return header; + return mergedHeader; } - /** - * Currently unsupported. - * @return - */ - public Collection getReaders() { - throw new StingException("Currently unable to get readers for shard-based fields."); + public SAMFileHeader getHeader(SAMReaderID id) { + return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader(); } /** @@ -296,15 +269,15 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { * @return False always. */ public boolean hasReadGroupCollisions() { - return false; + return hasReadGroupCollisions; } /** - * Currently unsupported. - * @return + * Gets the revised read group id mapped to this 'original' read group id. + * @return Merged read group ID. */ - public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { - throw new UnsupportedOperationException("Getting read group ID from this experimental SAM reader is not currently supported."); + public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { + return mergedReadGroupMappings.get(reader).get(originalReadGroupId); } private class SAMResourcePool { @@ -316,66 +289,130 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { /** * All iterators of this reference-ordered data. */ - private List allResources = new ArrayList(); + private List allResources = new ArrayList(); /** * All iterators that are not currently in service. */ - private List availableResources = new ArrayList(); + private List availableResources = new ArrayList(); public SAMResourcePool(final int maxEntries) { this.maxEntries = maxEntries; } + /** + * Dangerous internal method; retrieves any set of readers, whether in iteration or not. 
+ * Used to handle non-exclusive, stateless operations, such as index queries. + * @return Any collection of SAMReaders, whether in iteration or not. + */ + protected SAMReaders getReadersWithoutLocking() { + synchronized(this) { + if(allResources.size() == 0) + createNewResource(); + } + return allResources.get(0); + } + /** * Choose a set of readers from the pool to use for this query. When complete, * @return */ - public synchronized Collection getAvailableReaders() { + public synchronized SAMReaders getAvailableReaders() { if(availableResources.size() == 0) createNewResource(); - SAMFileReaders readers = availableResources.get(0); + SAMReaders readers = availableResources.get(0); availableResources.remove(readers); return readers; } - public synchronized void releaseReaders(Collection readers) { + public synchronized void releaseReaders(SAMReaders readers) { if(!allResources.contains(readers)) throw new StingException("Tried to return readers from the pool that didn't originate in the pool."); - availableResources.add((SAMFileReaders)readers); + availableResources.add(readers); } private synchronized void createNewResource() { if(allResources.size() > maxEntries) throw new StingException("Cannot create a new resource pool. All resources are in use."); - SAMFileReaders readers = new SAMFileReaders(reads); + SAMReaders readers = new SAMReaders(reads); allResources.add(readers); availableResources.add(readers); } + } + + /** + * A collection of readers derived from a reads metadata structure. + */ + private class SAMReaders implements Iterable { /** - * A collection of readers derived from a reads metadata structure. + * Internal storage for a map of id -> reader. */ - private class SAMFileReaders extends ArrayList { - /** - * Derive a new set of readers from the Reads metadata. - * @param sourceInfo Metadata for the reads to load. 
- */ - public SAMFileReaders(Reads sourceInfo) { - for(File readsFile: sourceInfo.getReadsFiles()) { - SAMFileReader2 reader = new SAMFileReader2(readsFile); - reader.setValidationStringency(sourceInfo.getValidationStringency()); - add(reader); - } + private final Map readers = new LinkedHashMap(); + + /** + * Derive a new set of readers from the Reads metadata. + * @param sourceInfo Metadata for the reads to load. + */ + public SAMReaders(Reads sourceInfo) { + for(File readsFile: sourceInfo.getReadsFiles()) { + SAMFileReader2 reader = new SAMFileReader2(readsFile); + reader.setValidationStringency(sourceInfo.getValidationStringency()); + readers.put(new SAMReaderID(readsFile),reader); } - } + } + + /** + * Retrieve the reader from the data structure. + * @param id The ID of the reader to retrieve. + */ + public SAMFileReader getReader(SAMReaderID id) { + if(!readers.containsKey(id)) + throw new NoSuchElementException("No reader is associated with id " + id); + return readers.get(id); + } + + /** + * Convenience method to get the header associated with an individual ID. + * @param id ID for which to retrieve the header. + * @return Header for this SAM file. + */ + public SAMFileHeader getHeader(SAMReaderID id) { + if(!readers.containsKey(id)) + throw new NoSuchElementException("No reader is associated with id " + id); + return readers.get(id).getFileHeader(); + } + + /** + * Returns an iterator over all readers in this structure. + * @return An iterator over readers. + */ + public Iterator iterator() { + return readers.values().iterator(); + } + + /** + * Returns whether this structure contains no readers. + * @return True if no readers are present; false otherwise. + */ + public boolean isEmpty() { + return readers.isEmpty(); + } + + /** + * Gets all the actual readers out of this data structure. + * @return A collection of the readers.
+ */ + public Collection values() { + return readers.values(); + } } private class ReleasingIterator implements StingSAMIterator { /** * The resource acting as the source of the data. */ - private final Collection resource; + private final SAMReaders resource; /** * The iterator to wrap. @@ -386,7 +423,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return wrappedIterator.getSourceInfo(); } - public ReleasingIterator( Collection resource, StingSAMIterator wrapped ) { + public ReleasingIterator(SAMReaders resource, StingSAMIterator wrapped) { this.resource = resource; this.wrappedIterator = wrapped; } @@ -412,4 +449,9 @@ public class BlockDrivenSAMDataSource extends SAMDataSource { return wrappedIterator.next(); } } + + /** + * Maps read group ids in the original SAMFileReaders to their revised ids in the merged header. + */ + private class ReadGroupMapping extends HashMap {} } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java index 34fabdd1d..bfa952cd7 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -4,7 +4,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileReader; import net.sf.samtools.util.CloseableIterator; -import net.sf.picard.sam.SamFileHeaderMerger; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard; @@ -17,9 +16,6 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; -import java.util.*; -import java.io.File; - /* * Copyright (c) 2009 The Broad Institute * @@ -112,13 +108,8
@@ public class IndexDrivenSAMDataSource extends SAMDataSource { return resourcePool.getHeader(); } - /** - * Returns a mapping from original input files to the SAMFileReaders - * - * @return the mapping - */ - public Map getFileToReaderMapping() { - return resourcePool.getFileToReaderMapping(); + public SAMFileHeader getHeader(SAMReaderID id) { + return resourcePool.fileToReaderMap.get(id.samFile).getFileHeader(); } /** @@ -128,21 +119,14 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { */ public Reads getReadsInfo() { return reads; } - /** - * Returns header merger: a class that keeps the mapping between original read groups and read groups - * of the merged stream; merger also provides access to the individual file readers (and hence headers - * prior to the merging too) maintained by the system. - * @return - */ - public Collection getReaders() { return resourcePool.getHeaderMerger().getReaders(); } - /** Returns true if there are read group duplicates within the merged headers. */ public boolean hasReadGroupCollisions() { return resourcePool.getHeaderMerger().hasReadGroupCollisions(); } /** Returns the read group id that should be used for the input read and RG id. 
*/ - public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + public String getReadGroupId(final SAMReaderID id, final String originalReadGroupId) { + SAMFileReader reader = resourcePool.getFileToReaderMapping().get(id.samFile); return resourcePool.getHeaderMerger().getReadGroupId(reader,originalReadGroupId); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index e0810736f..87167f942 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; import net.sf.picard.filter.FilteringIterator; import net.sf.picard.filter.SamRecordFilter; @@ -14,6 +13,8 @@ import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; import java.util.Collection; import java.util.Map; +import java.util.List; +import java.util.ArrayList; /* * Copyright (c) 2009 The Broad Institute @@ -48,10 +49,14 @@ import java.util.Map; * Converts shards to SAM iterators over the specified region */ public abstract class SAMDataSource implements SimpleDataSource { - /** Backing support for reads. */ protected final Reads reads; + /** + * Identifiers for the readers driving this data source. 
+ */ + protected final List readerIDs = new ArrayList(); + /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -87,6 +92,7 @@ public abstract class SAMDataSource implements SimpleDataSource { if (!smFile.canRead()) { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); } + readerIDs.add(new SAMReaderID(smFile)); } } @@ -104,6 +110,12 @@ public abstract class SAMDataSource implements SimpleDataSource { */ public abstract SAMFileHeader getHeader(); + /** + * Gets the (unmerged) header for the given reader. + * @param reader Unique identifier for the reader. + * @return Unmerged header. + */ + public abstract SAMFileHeader getHeader(SAMReaderID reader); /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as @@ -112,23 +124,27 @@ public abstract class SAMDataSource implements SimpleDataSource { */ public Reads getReadsInfo() { return reads; } - /** - * Returns a mapping from original input files to their (merged) read group ids - * - * @return the mapping - */ - public Map getFileToReaderMapping() { return null; } - /** * Returns readers used by this data source. */ - public abstract Collection getReaders(); + public List getReaderIDs() { + return readerIDs; + } + + /** + * Gets the SAM file associated with a given reader ID. + * @param id The reader for which to retrieve the source file. + * @return the file actually associated with the id. + */ + public File getSAMFile(SAMReaderID id) { + return id.samFile; + } /** Returns true if there are read group duplicates within the merged headers. */ public abstract boolean hasReadGroupCollisions(); /** Returns the read group id that should be used for the input read and RG id. 
*/ - public abstract String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId); + public abstract String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId); /** * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMReaderID.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMReaderID.java new file mode 100644 index 000000000..f5bf9f64e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMReaderID.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import java.io.File; + +/** + * Uniquely identifies a SAM file reader. + * + * @author mhanna + * @version 0.1 + */ +public class SAMReaderID { + /** + * The SAM file at the heart of this reader. SAMReaderID + * currently supports only file-based readers. + */ + protected final File samFile; + + /** + * Creates an identifier for a SAM file based on read. + * @param samFile The source file for SAM data. + */ + protected SAMReaderID(File samFile) { + this.samFile = samFile; + } + + /** + * Compare two IDs to see whether they're equal. + * @param other The other identifier. + * @return True iff the two readers point to the same file. + */ + public boolean equals(Object other) { + if(other == null) return false; + if(!(other instanceof SAMReaderID)) return false; + + SAMReaderID otherID = (SAMReaderID)other; + return this.samFile.equals(otherID.samFile); + } + + /** + * Generate a hash code for this object. + * @return A hash code, based solely on the file name at this point. 
+ */ + public int hashCode() { + return samFile.hashCode(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 6bcc9e173..77208e8d0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.utils.cmdLine.Argument; @@ -127,9 +128,10 @@ public class IndelRealigner extends ReadWalker { SAMFileWriterFactory factory = new SAMFileWriterFactory(); if ( NWAY_OUTPUT ) { - Map readerMap = getToolkit().getFileToReaderMapping(); - for ( File file : readerMap.keySet() ) { - SAMFileHeader header = readerMap.get(file).getFileHeader(); + List ids = getToolkit().getDataSource().getReaderIDs(); + for ( SAMReaderID id: ids ) { + File file = getToolkit().getDataSource().getSAMFile(id); + SAMFileHeader header = getToolkit().getSAMFileHeader(id); if ( SORTING_STRATEGY == RealignerSortingStrategy.NO_SORT ) header.setSortOrder(SAMFileHeader.SortOrder.unsorted); String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam";