diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 1e99c7770..7da0bcd6f 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -377,13 +377,45 @@ public class GenomeAnalysisEngine { } + /** + * Returns a mapping from original input files to the SAMFileReaders + * + * @return the mapping + */ + public Map getFileToReaderMapping() { + return getDataSource().getFileToReaderMapping(); + } + /** * Returns a mapping from original input files to their (merged) read group ids * * @return the mapping */ public Map> getFileToReadGroupIdMapping() { - return getDataSource().getFileToReadGroupIdMapping(); + Map fileToReaderMap = getFileToReaderMapping(); + + // populate the file -> read group mapping + Map> fileToReadGroupIdMap = new HashMap>(); + for (Map.Entry entry : fileToReaderMap.entrySet()) { + + Set readGroups = new HashSet(5); + + for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) { + if (getDataSource().hasReadGroupCollisions()) { + // Check if there were read group clashes. + // If there were, use the SamFileHeaderMerger to translate from the + // original read group id to the read group id in the merged stream + readGroups.add(getDataSource().getReadGroupId(entry.getValue(), g.getReadGroupId())); + } else { + // otherwise, pass through the unmapped read groups since this is what Picard does as well + readGroups.add(g.getReadGroupId()); + } + } + + fileToReadGroupIdMap.put(entry.getKey(), readGroups); + } + + return fileToReadGroupIdMap; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java index 56b2df28d..a922e8aeb 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -113,12 +113,12 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { } /** - * Returns a mapping from original input files to their (merged) read group ids + * Returns a mapping from original input files to the SAMFileReaders * * @return the mapping */ - public Map> getFileToReadGroupIdMapping() { - return resourcePool.getFileToReadGroupIdMapping(); + public Map getFileToReaderMapping() { + return resourcePool.getFileToReaderMapping(); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java index 20e8a7a9b..66a665171 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java @@ -77,7 +77,7 @@ class ReadStreamResource { /** * A mapping from original input file to merged read group record ids */ - private Map> fileToReadGroupIdMap = null; + private Map fileToReaderMap = null; public ReadStreamResource( Reads sourceInfo ) { SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate); @@ -142,8 +142,8 @@ class ReadStreamResource { return readStreamPointer.getReadsOverlapping(segment); } - public Map> getFileToReadGroupIdMapping() { - return fileToReadGroupIdMap; + public Map getFileToReaderMapping() { + return fileToReaderMap; } /** @@ -159,7 +159,7 @@ class ReadStreamResource { // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); - Map fileToReaderMap = new HashMap(); + fileToReaderMap = new HashMap(); for (File f : reads.getReadsFiles()) { SAMFileReader reader = new SAMFileReader(f, eagerDecode); fileToReaderMap.put(f, reader); @@ -180,30 +180,6 @@ class ReadStreamResource { lst.add(reader); } - // create the header merger - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst,SORT_ORDER,true); - - // populate the file -> read group mapping - fileToReadGroupIdMap = new HashMap>(); - for (Map.Entry entry : fileToReaderMap.entrySet()) { - - Set readGroups = new HashSet(5); - - for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) { - if (headerMerger.hasReadGroupCollisions()) { - // Check if there were read group clashes. - // If there were, use the SamFileHeaderMerger to translate from the - // original read group id to the read group id in the merged stream - readGroups.add(headerMerger.getReadGroupId(entry.getValue(), g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - readGroups.add(g.getReadGroupId()); - } - } - - fileToReadGroupIdMap.put(entry.getKey(), readGroups); - } - - return headerMerger; + return new SamFileHeaderMerger(lst,SORT_ORDER,true); } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index da04745d3..e0810736f 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -13,7 +13,6 @@ import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; import java.util.Collection; -import java.util.Set; import java.util.Map; /* @@ -118,7 +117,7 @@ public abstract class SAMDataSource implements SimpleDataSource { * * @return the mapping */ - public Map> getFileToReadGroupIdMapping() { return null; } + public Map getFileToReaderMapping() { return null; } /** * Returns readers used by this data source. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java index 5df61de57..070a01077 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java @@ -32,9 +32,9 @@ import org.apache.log4j.Logger; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; import java.util.List; -import java.util.Set; import java.util.Map; import java.io.File; @@ -52,7 +52,7 @@ class SAMResourcePool extends ResourcePool /** Source information about the reads. */ protected Reads reads; protected SamFileHeaderMerger headerMerger; - protected Map> fileToReadGroupIdMap; + protected Map fileToReaderMap; /** * Do all the constituent BAM files have indices? We support some very limited @@ -74,7 +74,7 @@ class SAMResourcePool extends ResourcePool this.header = streamResource.getHeader(); this.headerMerger = streamResource.getHeaderMerger(); this.hasIndex = streamResource.hasIndex(); - this.fileToReadGroupIdMap = streamResource.getFileToReadGroupIdMapping(); + this.fileToReaderMap = streamResource.getFileToReaderMapping(); // Add this resource to the pool. this.addNewResource(streamResource); @@ -93,12 +93,12 @@ class SAMResourcePool extends ResourcePool public Reads getReadsInfo() { return reads; } /** - * Returns a mapping from original input files to their (merged) read group ids + * Returns a mapping from original input files to the SAMFileReaders * * @return the mapping */ - public Map> getFileToReadGroupIdMapping() { - return fileToReadGroupIdMap; + public Map getFileToReaderMapping() { + return fileToReaderMap; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java index cb050587a..45f234524 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java @@ -138,11 +138,11 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul } double[] likelihoods = gl.getLikelihoods(); beagleWriter.print(' '); - beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.REF.ordinal()]))); + beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[refGenotype.ordinal()]))); beagleWriter.print(' '); - beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.HET.ordinal()]))); + beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[hetGenotype.ordinal()]))); beagleWriter.print(' '); - beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.HOM.ordinal()]))); + beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[homGenotype.ordinal()]))); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d6f82245a..21ce79e01 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.utils.cmdLine.Argument; import net.sf.samtools.*; @@ -35,11 +36,17 @@ public class IndelRealigner extends ReadWalker { protected boolean NWAY_OUTPUT = true; @Argument(fullName="outputSuffix", shortName="suffix", required=false, doc="Suffix to append to output bams (when using --NwayOutput) [default:'.cleaned']") - protected String outputSuffix = ".cleaned"; + protected String outputSuffix = "cleaned"; @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") protected Integer compressionLevel = 5; + @Argument(fullName="sortOnDisk", shortName="sortOnDisk", required=false, doc="Should we sort on disk instead of on the fly? This option is much slower but should be used when on-the-fly sorting fails because reads are too long [default:no]") + protected boolean SORT_ON_DISK = false; + + @Argument(fullName="knownIndels", shortName="knownIndels", required=false, doc="One or more rod triplets of known indels to try for alternate consenses; types must implement VariationRod") + protected ArrayList knownIndels = new ArrayList(); + // ADVANCED OPTIONS FOLLOW @Argument(fullName="outputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") @@ -73,9 +80,14 @@ public class IndelRealigner extends ReadWalker { // the reads that fall into the current interval private ReadBin readsToClean = new ReadBin(); private ArrayList readsNotToClean = new ArrayList(); + private TreeSet knownIndelsToTry = new TreeSet(new Comparator(){ + public int compare(VariationRod rod1, VariationRod rod2) { + return (int)(rod1.getLocation().getStart() - rod2.getLocation().getStart()); + } + }); // the wrapper around the SAM writer - private Map writers = null; + private Map writers = null; // random number generator private Random generator; @@ -110,21 +122,24 @@ public class IndelRealigner extends ReadWalker { // set up the output writer(s) if ( baseWriterFilename != null ) { - writers = new HashMap(); + writers = new HashMap(); Map> readGroupMap = getToolkit().getFileToReadGroupIdMapping(); SAMFileWriterFactory factory = new SAMFileWriterFactory(); if ( NWAY_OUTPUT ) { - for ( File file : readGroupMap.keySet() ) { + Map readerMap = getToolkit().getFileToReaderMapping(); + for ( File file : readerMap.keySet() ) { String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam"; - SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename, newFileName), compressionLevel); - SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); + SAMFileWriter writer = factory.makeBAMWriter(readerMap.get(file).getFileHeader(), !SORT_ON_DISK, new File(baseWriterFilename, newFileName), compressionLevel); + if ( !SORT_ON_DISK ) + writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW); for ( String rg : readGroupMap.get(file) ) writers.put(rg, writer); } } else { - SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename), compressionLevel); - SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); + SAMFileWriter writer = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), !SORT_ON_DISK, new File(baseWriterFilename), compressionLevel); + if ( !SORT_ON_DISK ) + writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW); for ( Set set : readGroupMap.values() ) { for ( String rg : set ) writers.put(rg, writer); @@ -135,6 +150,22 @@ public class IndelRealigner extends ReadWalker { // set up the random generator generator = new Random(RANDOM_SEED); + // set up the rods (since this is a ReadWalker we don't get rods from the traversal) + List> rods = new ArrayList>(); + ReferenceOrderedData.parseBindings(knownIndels, rods); + for ( ReferenceOrderedData rod : rods ) { + if ( !(rod instanceof VariationRod) ) + continue; + SeekableRODIterator iter = rod.iterator(); + while ( iter.hasNext() ) { + RODRecordList records = iter.next(); + for ( ReferenceOrderedDatum record : records ) { + if (((VariationRod)record).isIndel()) + knownIndelsToTry.add((VariationRod)record); + } + } + } + if ( OUT_INDELS != null ) { try { indelOutput = new FileWriter(new File(OUT_INDELS)); @@ -182,8 +213,8 @@ public class IndelRealigner extends ReadWalker { return; // break out the reads into sets for their respective writers - Map> bins = new HashMap>(); - for ( SortingSAMFileWriter writer : writers.values() ) + Map> bins = new HashMap>(); + for ( SAMFileWriter writer : writers.values() ) bins.put(writer, new HashSet()); for ( SAMRecord read : reads ) { @@ -197,8 +228,15 @@ public class IndelRealigner extends ReadWalker { } } - for ( Map.Entry> entry : bins.entrySet() ) - entry.getKey().addAlignments(entry.getValue()); + for ( Map.Entry> entry : bins.entrySet() ) { + if ( !SORT_ON_DISK ) { + // we can be efficient in this case by batching the reads all together + ((SortingSAMFileWriter)entry.getKey()).addAlignments(entry.getValue()); + } else { + for ( SAMRecord read : entry.getValue() ) + entry.getKey().addAlignment(read); + } + } } public Integer map(char[] ref, SAMRecord read) { @@ -237,6 +275,9 @@ public class IndelRealigner extends ReadWalker { } } else { // the read is past the current interval + + // TODO - NOW WE NEED TO GET THE APPROPRIATE KNOWN INDELS FOR THIS REGION + clean(readsToClean); // merge the two sets for emission @@ -272,13 +313,14 @@ public class IndelRealigner extends ReadWalker { } if ( writers != null ) { - for ( SortingSAMFileWriter writer : writers.values() ) { - writer.close(); - - // TODO -- fix me + for ( SAMFileWriter writer : writers.values() ) { + // TODO -- figure out why we're getting an exception thrown here + // TODO -- because we need to call close() to flush out the remaining reads from the writer try { - writer.getBaseWriter().close(); - } catch (net.sf.samtools.util.RuntimeIOException e) {} + writer.close(); + if ( !SORT_ON_DISK ) + ((SortingSAMFileWriter)writer).getBaseWriter().close(); + } catch (RuntimeException e) {} } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f0405110c..39d697a73 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -191,7 +191,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testOtherOutput() { - String[] md5s = {"78482125d51f9eb2ee850a6b25921e84", "cf39a4a90e9f01b6f1183e6b1fd7520e"}; + String[] md5s = {"78482125d51f9eb2ee850a6b25921e84", "8cba0b8752f18fc620b4697840bc7291"}; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper" + " -R " + oneKGLocation + "reference/human_b36_both.fasta" +