diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 705dcdad7..1008a3edc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -31,7 +31,6 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -67,26 +66,15 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) protected double MISMATCH_THRESHOLD = 0.15; - @Argument(fullName="output", shortName="O", required=false, doc="Output bam (or directory if using --NwayOutput)") - protected String baseWriterFilename = null; - - @Argument(fullName="NWayOutput", shortName="nway", required=false, doc="Should the reads be emitted in a separate bam file for each one of the input bams? [default:no]") - protected boolean NWAY_OUTPUT = false; - - @Argument(fullName="outputSuffix", shortName="suffix", required=false, doc="Suffix to append to output bams (when using --NwayOutput) [default:'.cleaned']") - protected String outputSuffix = "cleaned"; + @Argument(fullName="output", shortName="O", required=false, doc="Output bam") + protected String writerFilename = null; @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") protected Integer compressionLevel = 5; - public enum RealignerSortingStrategy { - NO_SORT, - ON_DISK, - IN_MEMORY - } - - @Argument(fullName="sortStrategy", shortName="sort", required=false, doc="What type of sorting strategy should we use? Options include NO_SORT, ON_DISK, and IN_MEMORY. Sorting in memory is much faster than on disk but should be used with care - with too much coverage or with long reads it might generate failures [default:ON_DISK]") - protected RealignerSortingStrategy SORTING_STRATEGY = RealignerSortingStrategy.ON_DISK; + @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ + "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false) + protected int MAX_RECORDS_IN_RAM = 500000; // ADVANCED OPTIONS FOLLOW @@ -109,14 +97,8 @@ public class IndelRealigner extends ReadWalker { "if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false) protected int MAX_READS = 20000; - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time "+ - "when using ON_DISK sorting option. If too low, the tool may run out of system file descriptors needed to perform sorting; "+ - "if too high, the tool may run out of memory in the regions of unusually deep coverage (consider also increasing VM heap size if this happens)", - required=false) - protected int MAX_RECORDS_IN_RAM = 500000; - - @Argument(fullName="writerWindowSize", shortName="writerWindowSize", doc="the window over which the writer will store reads when --sortInMemory is enabled", required=false) - protected int SORTING_WRITER_WINDOW = 300; + @Argument(fullName="sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", required=false, doc="Should we sort the final bam in coordinate order even though it will be malformed because mate pairs of realigned reads will contain inaccurate information?") + protected boolean SORT_IN_COORDINATE_ORDER = false; @Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") protected boolean NO_PG_TAG = false; @@ -133,7 +115,7 @@ public class IndelRealigner extends ReadWalker { private final IdentityHashMap knownIndelsToTry = new IdentityHashMap(); // the wrapper around the SAM writer - private Map writers = null; + private SAMFileWriter writer = null; // random number generator private static final long RANDOM_SEED = 1252863495; @@ -173,32 +155,13 @@ public class IndelRealigner extends ReadWalker { currentInterval = intervals.hasNext() ? intervals.next() : null; // set up the output writer(s) - if ( baseWriterFilename != null ) { - writers = new HashMap(); - Map> readGroupMap = getToolkit().getFileToReadGroupIdMapping(); + if ( writerFilename != null ) { SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setMaxRecordsInRam(MAX_RECORDS_IN_RAM); - if ( NWAY_OUTPUT ) { - List ids = getToolkit().getDataSource().getReaderIDs(); - for ( SAMReaderID id: ids ) { - File file = getToolkit().getDataSource().getSAMFile(id); - SAMFileHeader header = getToolkit().getSAMFileHeader(); - String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam"; - File newFile = new File(baseWriterFilename, newFileName); - SAMFileWriter writer = makeWriter(factory, header, newFile); - for ( String rg : readGroupMap.get(file) ) - writers.put(rg, writer); - } - } else { - SAMFileHeader header = getToolkit().getSAMFileHeader(); - File file = new File(baseWriterFilename); - SAMFileWriter writer = makeWriter(factory, header, file); - for ( Set set : readGroupMap.values() ) { - for ( String rg : set ) - writers.put(rg, writer); - } - } + SAMFileHeader header = getToolkit().getSAMFileHeader(); + File file = new File(writerFilename); + writer = makeWriter(factory, header, file); } if ( OUT_INDELS != null ) { @@ -231,8 +194,10 @@ public class IndelRealigner extends ReadWalker { } private SAMFileWriter makeWriter(SAMFileWriterFactory factory, SAMFileHeader header, File file) { - if ( SORTING_STRATEGY == RealignerSortingStrategy.NO_SORT ) - header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + if ( SORT_IN_COORDINATE_ORDER ) + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + else + header.setSortOrder(SAMFileHeader.SortOrder.queryname); if ( !NO_PG_TAG ) { final SAMProgramRecord programRecord = new SAMProgramRecord("GATK IndelRealigner"); @@ -241,61 +206,23 @@ public class IndelRealigner extends ReadWalker { header.addProgramRecord( programRecord ); } - SAMFileWriter writer = factory.makeBAMWriter(header, SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY, file, compressionLevel); - - if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY ) - writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW); + SAMFileWriter writer = factory.makeBAMWriter(header, false, file, compressionLevel); return writer; } private void emit(final SAMRecord read) { - if ( writers != null ) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if ( readGroup == null || readGroup.getReadGroupId() == null ) { - if ( writers.size() > 1 ) - throw new StingException("There are multiple output writers but read " + read.toString() + " has no read group"); - writers.values().iterator().next().addAlignment(read); - } else { - writers.get(readGroup.getReadGroupId()).addAlignment(read); - } - } + if ( writer != null ) + writer.addAlignment(read); } private void emit(final List reads) { - if ( writers == null ) - return; - - // break out the reads into sets for their respective writers - Map> bins = new HashMap>(); - for ( SAMFileWriter writer : writers.values() ) - bins.put(writer, new HashSet()); - - for ( SAMRecord read : reads ) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if ( readGroup == null || readGroup.getReadGroupId() == null ) { - if ( writers.size() > 1 ) - throw new StingException("There are multiple output writers but read " + read.toString() + " has no read group"); - bins.get(writers.values().iterator().next()).add(read); - } else { - bins.get(writers.get(readGroup.getReadGroupId())).add(read); - } - } - - for ( Map.Entry> entry : bins.entrySet() ) { - if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY ) { - // we can be efficient in this case by batching the reads all together - ((SortingSAMFileWriter)entry.getKey()).addAlignments(entry.getValue()); - } else { - for ( SAMRecord read : entry.getValue() ) - entry.getKey().addAlignment(read); - } + if ( writer != null ) { + for ( SAMRecord read : reads ) + writer.addAlignment(read); } } - long nPerfectMatches = 0; - long nReadsToClean = 0; - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); @@ -328,7 +255,6 @@ public class IndelRealigner extends ReadWalker { readsNotToClean.add(read); } else { - nReadsToClean++; readsToClean.add(read, ref.getBases()); // add the rods to the list of known variants populateKnownIndels(metaDataTracker, null); @@ -386,14 +312,8 @@ public class IndelRealigner extends ReadWalker { emit(readsNotToClean); } - if ( writers != null ) { - HashSet uniqueWriters = new HashSet(writers.values()); - for ( SAMFileWriter writer : uniqueWriters ) { - writer.close(); - if ( SORTING_STRATEGY == RealignerSortingStrategy.IN_MEMORY ) - ((SortingSAMFileWriter)writer).getBaseWriter().close(); - } - } + if ( writer != null ) + writer.close(); if ( OUT_INDELS != null ) { try { @@ -536,11 +456,6 @@ public class IndelRealigner extends ReadWalker { } // otherwise, we can emit it as is else { -// nPerfectMatches++; -// if ( nPerfectMatches % 1000 == 0 ) { -// logger.info(String.format("Perfect matching fraction: %d %d => %.2f", nPerfectMatches, nReadsToClean, 100.0 * nPerfectMatches / ( nReadsToClean + 1))); -// } - // if ( debugOn ) System.out.println("Emitting as is..."); //logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); refReads.add(read); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java deleted file mode 100755 index 70bbafcae..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java +++ /dev/null @@ -1,125 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; - -import java.util.TreeSet; -import java.util.Iterator; -import java.util.Collection; - -/** - * @author ebanks - * SortingSAMFileWriter - * - * this class extends the samtools SAMFileWriter class and caches reads for N loci so that reads - * can be emitted out of order (provided they are within the N-locus window) - * - */ -public class SortingSAMFileWriter implements SAMFileWriter { - - // the base writer from Picard - private SAMFileWriter baseWriter; - - // the window over which we agree to accumulate reads - private int window; - - // the reads we are accumulating - private TreeSet cachedReads = new TreeSet(new SAMRecordCoordinateComparatorWithUnmappedReads()); - - - /** - * Constructor - * - * @param baseWriter the real SAMFileWriter - * @param window the window over which we agree to store reads - */ - public SortingSAMFileWriter(SAMFileWriter baseWriter, int window) { - this.baseWriter = baseWriter; - this.window = window; - } - - /** - * Add a read to the writer for emission - * - * @param read the read to emit - */ - public void addAlignment(SAMRecord read) { - // at a new contig, clear the cache - if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < read.getReferenceIndex() ) - clearCache(); - - long currentPos = read.getAlignmentStart(); - - Iterator iter = cachedReads.iterator(); - while ( iter.hasNext() ) { - SAMRecord cachedRead = iter.next(); - if ( currentPos - cachedRead.getAlignmentStart() >= window ) { - baseWriter.addAlignment(cachedRead); - iter.remove(); - } else { - break; - } - } - - cachedReads.add(read); - } - - /** - * Add a list of reads to the writer for emission; the reads do NOT need to be sorted - * - * @param reads the reads to emit - */ - public void addAlignments(Collection reads) { - if ( reads.size() == 0 ) - return; - - // at a new contig, clear the cache - if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < reads.iterator().next().getReferenceIndex() ) - clearCache(); - - cachedReads.addAll(reads); - - // get the last read in the cache - SAMRecord last = cachedReads.last(); - - long currentPos = last.getAlignmentStart(); - - Iterator iter = cachedReads.iterator(); - while ( iter.hasNext() ) { - SAMRecord cachedRead = iter.next(); - if ( currentPos - cachedRead.getAlignmentStart() >= window ) { - baseWriter.addAlignment(cachedRead); - iter.remove(); - } else { - break; - } - } - } - - /** - * @return the SAM file header - */ - public SAMFileHeader getFileHeader() { - return baseWriter.getFileHeader(); - } - - /** - * close this writer by clearing the cache (but DO NOT close the underlying SAMFileWriter) - */ - public void close() { - clearCache(); - } - - /** - * @return the underlying SAMFileWriter - */ - public SAMFileWriter getBaseWriter() { - return baseWriter; - } - - private void clearCache() { - Iterator iter = cachedReads.iterator(); - while ( iter.hasNext() ) - baseWriter.addAlignment(iter.next()); - cachedReads.clear(); - } -} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 56107c997..e7f0b14bf 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -4,38 +4,23 @@ import org.broadinstitute.sting.WalkerTest; import org.junit.Test; import java.util.Arrays; -import java.io.File; public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealigner() { - String[] md5lod5 = {"96edef86cea95f312ee8295b38227eb8", "d4d8ff567b614729ab8c52bd7d6bef48"}; + String[] md5lod5 = {"d9cbff4832fc3ee7a7ad1c58cc891bdd", "d4d8ff567b614729ab8c52bd7d6bef48"}; WalkerTestSpec spec1 = new WalkerTestSpec( - "-T IndelRealigner -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023800-10332350 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -snps %s", + "-T IndelRealigner -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023800-10332350 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -snps %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, Arrays.asList(md5lod5)); executeTest("test Lod5", spec1); - String[] md5lod200 = {"96edef86cea95f312ee8295b38227eb8", "d4d8ff567b614729ab8c52bd7d6bef48"}; + String[] md5lod200 = {"d9cbff4832fc3ee7a7ad1c58cc891bdd", "d4d8ff567b614729ab8c52bd7d6bef48"}; WalkerTestSpec spec2 = new WalkerTestSpec( - "-T IndelRealigner -noPG -LOD 200 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023800-10332350 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -snps %s", + "-T IndelRealigner -noPG -LOD 200 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023800-10332350 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -snps %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, Arrays.asList(md5lod200)); executeTest("test Lod200", spec2); - - String filename1 = "NA12878.chrom1.SLX.SRP000032.2009_06"; - String filename2 = "low_coverage_CEU.chr1.10k-11k"; - WalkerTestSpec spec3 = new WalkerTestSpec( - "-T IndelRealigner -nway -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + filename1 + ".bam -I " + validationDataLocation + filename2 + ".bam -L 1:10023900-10024000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O /tmp -snps %s", - 1, - Arrays.asList("bd42a4fa66d7ec7a480c2b94313a78d3")); - File file1 = new File("/tmp/" + filename1 + ".cleaned.bam"); - file1.deleteOnExit(); - spec3.addAuxFile("1ceae553c8aa20681ed0736d4d2b4541", file1); - File file2 = new File("/tmp/" + filename2 + ".cleaned.bam"); - file2.deleteOnExit(); - spec3.addAuxFile("ce8ddeae5a5aab836ac1dde9448ccb66", file2); - executeTest("test NWay", spec3); } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java index 62d992d6f..63b154763 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java @@ -18,10 +18,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-5,650,000" + - " -compress 1" + - " -sort NO_SORT" + - " -targetIntervals " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.realigner.intervals" + - " -O /dev/null", + " -targetIntervals " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.realigner.intervals", 0, new ArrayList(0)); try { @@ -38,10 +35,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L chr1:1-150,000,000" + - " -compress 1" + - " -sort NO_SORT" + - " -targetIntervals " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.realigner.intervals" + - " -O /dev/null", + " -targetIntervals " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.realigner.intervals", 0, new ArrayList(0)); try {