From 36fb6ca3c5a4b4305e6c273ccdc21c067da4a31f Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 5 Jun 2009 08:48:34 +0000 Subject: [PATCH] Allow user to specify the compression to be used when writing out BAM files. Updated most of the walkers to reflect this change. Now it won't take forever to write BAMs! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@909 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GATKArgumentCollection.java | 8 ++++++++ .../sting/gatk/GenomeAnalysisEngine.java | 11 +++++++++++ .../sting/gatk/walkers/PrintReadsWalker.java | 5 ++--- .../playground/gatk/walkers/IOCrusherWalker.java | 7 +++---- .../gatk/walkers/LogisticRecalibrationWalker.java | 3 +-- .../playground/gatk/walkers/ReadFilterWalker.java | 6 +++--- .../sting/playground/gatk/walkers/ReplaceQuals.java | 4 ++-- .../walkers/duplicates/CombineDuplicatesWalker.java | 5 ++--- .../gatk/walkers/indels/IntervalCleanerWalker.java | 2 +- java/src/org/broadinstitute/sting/utils/Utils.java | 10 +++++++--- 10 files changed, 40 insertions(+), 21 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index b67b7738c..c87ae5828 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -112,6 +112,10 @@ public class GATKArgumentCollection { @Argument(fullName = "sort_on_the_fly", shortName = "sort", doc = "Maximum number of reads to sort on the fly", required = false) public Integer maximumReadSorts = null; + @Element(required=false) + @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files", required = false) + public Integer BAMcompression = null; + @Element(required=false) @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false) public Boolean filterZeroMappingQualityReads = false; @@ -245,6 +249,10 @@ public class GATKArgumentCollection { (other.maximumReadSorts != null && !other.maximumReadSorts.equals(this.maximumReadSorts))) { return false; } + if ((other.BAMcompression == null && this.BAMcompression != null) || + (other.BAMcompression != null && !other.BAMcompression.equals(this.BAMcompression))) { + return false; + } if ((other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null) || (other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads))) { return false; diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 47e985ae2..6179d3297 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -280,6 +280,17 @@ public class GenomeAnalysisEngine { return strictness; } + /** + * Default to 5 (based on research by Alec Wysoker) + * + * @return the BAM compression + */ + public int getBAMCompression() { + return (argCollection.BAMcompression == null || + argCollection.BAMcompression < 1 || + argCollection.BAMcompression > 8) ? 5 : argCollection.BAMcompression; + } + /** * Convenience function that binds RODs using the old-style command line parser to the new style list for * a uniform processing. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index dae6c404b..e569552da 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -2,9 +2,9 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.Utils; import java.io.PrintStream; import java.io.FileNotFoundException; @@ -22,9 +22,8 @@ public class PrintReadsWalker extends ReadWalker { public SAMFileWriter reduceInit() { if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) { - SAMFileWriterFactory fact = new SAMFileWriterFactory(); SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); - return fact.makeBAMWriter(header, true, new File(outputBamFile)); + return Utils.createSAMFileWriterWithCompression(header, true, outputBamFile, getToolkit().getBAMCompression()); } else { return null; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java index 15ba86991..0fcfa7980 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IOCrusherWalker.java @@ -4,9 +4,9 @@ import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; import net.sf.samtools.SAMFileHeader; import net.sf.picard.reference.ReferenceSequence; @@ -48,11 +48,10 @@ public class IOCrusherWalker extends ReadWalker reduceInit() { - SAMFileWriterFactory fact = new SAMFileWriterFactory(); ArrayList outputs = new ArrayList(nWaysOut); for ( int i = 0; i < nWaysOut; i++ ) { - SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader(); - outputs.add(fact.makeBAMWriter(header, true, new File(outputBase + "." + i + ".bam"))); + SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); + outputs.add(Utils.createSAMFileWriterWithCompression(header, true, outputBase + "." + i + ".bam", getToolkit().getBAMCompression())); } return outputs; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java index ac2d091da..2c5d2aae3 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LogisticRecalibrationWalker.java @@ -149,9 +149,8 @@ public class LogisticRecalibrationWalker extends ReadWalker { @Argument(fullName="output_file", shortName="O",doc="SAM or BAM file to write filtered reads into (will be overwritten if exists)",required=true ) public String output; @@ -18,8 +18,8 @@ public class ReadFilterWalker extends ReadWalker { private SAMFileWriter writer = null; public void initialize() { - SAMFileHeader header = getToolkit().getSamReader().getFileHeader(); - writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, header.getSortOrder() != SAMFileHeader.SortOrder.unsorted, new File(output)); + SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); + writer = Utils.createSAMFileWriterWithCompression(header, header.getSortOrder() != SAMFileHeader.SortOrder.unsorted, output, getToolkit().getBAMCompression()); } @Override diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java index 7f8c368ac..17d96ed56 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ReplaceQuals.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.Utils; import net.sf.samtools.*; import net.sf.picard.reference.ReferenceSequence; @@ -108,9 +109,8 @@ public class ReplaceQuals extends ReadWalker { public SAMFileWriter reduceInit() { if ( outputFilename != null ) { // ! outputBamFile.equals("") ) { - SAMFileWriterFactory fact = new SAMFileWriterFactory(); SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader(); - return fact.makeBAMWriter(header, true, new File(outputFilename)); + return Utils.createSAMFileWriterWithCompression(header, true, outputFilename, getToolkit().getBAMCompression()); } else { return null; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java index 50d8db818..d30f5753b 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/duplicates/CombineDuplicatesWalker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.playground.gatk.walkers; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.duplicates.DupUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; @@ -11,7 +12,6 @@ import java.io.File; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; import net.sf.samtools.SAMFileHeader; /** @@ -49,9 +49,8 @@ public class CombineDuplicatesWalker extends DuplicateWalker throw new RuntimeException("LOD threshold cannot be a negative number"); SAMFileHeader header = getToolkit().getEngine().getSAMHeader(); - writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); + writer = Utils.createSAMFileWriterWithCompression(header, false, OUT, getToolkit().getBAMCompression()); if ( OUT_INDELS != null ) { try { indelOutput = new FileWriter(new File(OUT_INDELS)); diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index c9d531180..6ed01b089 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.utils; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.*; import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequenceFile; @@ -45,6 +43,12 @@ public class Utils { throw new RuntimeException(msg); } + public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { + if (file.endsWith(".bam")) + return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); + } + /** * Returns a new list built from those objects found in collection that satisfy the * predicate ( i.e. pred.apply() is true for the objects in th eresulting list ).