From 44f3c5639ad4152e7bea9573dbe525b0069bc494 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 24 Aug 2010 04:14:50 +0000 Subject: [PATCH] I have finally figured out that when you volunteer to do something in group meeting, you keep getting pestered about it on Mark's Omniplan doc until it gets done (except for contig aliasing, of course). As such... We can now emit bzipped VCFs from the GATK. Details: any walker that defines a VCFWriter for its @Output (i.e. pretty much every core walker from UG and on), also has associated with it the -bzip (--bzip_compression) boolean argument. When set, it will emit a VCF that is compressed with bzip2. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4093 348d0f76-0448-11de-a6fe-93d51630548a --- ivy.xml | 4 +- .../commandline/ArgumentTypeDescriptor.java | 15 ++++++- .../sting/gatk/io/CompressedVCFWriter.java | 45 +++++++++++++++++++ .../gatk/io/storage/VCFWriterStorage.java | 11 ++++- .../VCFWriterArgumentTypeDescriptor.java | 35 +++++++++++++-- .../sting/gatk/io/stubs/VCFWriterStub.java | 21 ++++++++- 6 files changed, 122 insertions(+), 9 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/io/CompressedVCFWriter.java diff --git a/ivy.xml b/ivy.xml index ba02d9b6b..c7549459c 100644 --- a/ivy.xml +++ b/ivy.xml @@ -27,9 +27,9 @@ - - + + diff --git a/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index acbef061e..06c220da6 100644 --- a/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -34,7 +34,6 @@ import org.apache.log4j.Logger; import java.lang.annotation.Annotation; import java.lang.reflect.*; import java.util.*; -import java.io.OutputStream; /** * An descriptor capable of providing parsers that can parse any type @@ -192,6 +191,20 @@ public abstract class ArgumentTypeDescriptor { 
return validOptions; } + /** + * Returns true if an argument match with the given definition exists in the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The collection of matches to search. + * @return true if the argument is present, or false if not present. + */ + protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + return true; + } + return false; + } + /** * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. * If the argument matches multiple values, an exception will be thrown. diff --git a/java/src/org/broadinstitute/sting/gatk/io/CompressedVCFWriter.java b/java/src/org/broadinstitute/sting/gatk/io/CompressedVCFWriter.java new file mode 100755 index 000000000..d0f0fdd01 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/io/CompressedVCFWriter.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.io; + +import org.broad.tribble.vcf.StandardVCFWriter; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import java.io.OutputStream; +import java.io.IOException; + +/** + * A writer that allows bzip2-compressed VCF files to be written on-the-fly. + * + * @author ebanks + * @version 0.1 + */ +public class CompressedVCFWriter extends StandardVCFWriter { + + public CompressedVCFWriter(OutputStream output) throws IOException { + super(new BZip2CompressorOutputStream(output)); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index d1422056e..23ff429fd 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -8,6 +8,7 @@ import org.broad.tribble.vcf.VCFWriter; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; +import org.broadinstitute.sting.gatk.io.CompressedVCFWriter; import java.io.*; import java.util.Set; @@ -45,7 +46,15 @@ public class VCFWriterStorage implements Storage, VCFWriter { else throw new StingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); - writer = new StandardVCFWriter(stream); + if ( stub.isCompressed() ) { + try { + writer = new CompressedVCFWriter(stream); + } catch (IOException e) { + throw new StingException("Unable to create a compressed output stream: " + e.getMessage()); + } + } 
else { + writer = new StandardVCFWriter(stream); + } } /** diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 60306e805..398a7b032 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -41,6 +41,10 @@ import java.util.Arrays; * @version 0.1 */ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + private static final String COMPRESSION_FULLNAME = "bzip_compression"; + private static final String COMPRESSION_SHORTNAME = "bzip"; + /** * The engine into which output stubs should be fed. */ @@ -73,7 +77,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public List createArgumentDefinitions( ArgumentSource source ) { - return Arrays.asList( createDefaultArgumentDefinition(source) ); + return Arrays.asList( createDefaultArgumentDefinition(source), + createVCFCompressionArgumentDefinition(source) ); } /** @@ -87,7 +92,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object createTypeDefault(ArgumentSource source,Class type) { - VCFWriterStub stub = new VCFWriterStub(engine,defaultOutputStream); + VCFWriterStub stub = new VCFWriterStub(engine, defaultOutputStream, false); engine.addOutput(stub); return stub; } @@ -105,11 +110,35 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { String writerFileName = getArgumentValue(createDefaultArgumentDefinition(source),matches); File writerFile = writerFileName != null ? new File(writerFileName) : null; + // Should we compress the output stream? + boolean compress = argumentIsPresent(createVCFCompressionArgumentDefinition(source), matches); + // Create a stub for the given object. 
- VCFWriterStub stub = (writerFile != null) ? new VCFWriterStub(engine, writerFile) : new VCFWriterStub(engine,System.out); + VCFWriterStub stub = (writerFile != null) ? new VCFWriterStub(engine, writerFile, compress) : new VCFWriterStub(engine, System.out, compress); engine.addOutput(stub); return stub; } + + /** + * Creates the optional compression argument for the VCF file. + * @param source Argument source for the VCF file. Must not be null. + * @return Argument definition for the bzip compression flag. Will not be null. + */ + private ArgumentDefinition createVCFCompressionArgumentDefinition(ArgumentSource source) { + return new ArgumentDefinition( ArgumentIOType.getIOType(getArgumentAnnotation(source)), + boolean.class, + COMPRESSION_FULLNAME, + COMPRESSION_SHORTNAME, + "Should we bzip the output VCF?", + false, + true, + false, + false, + null, + null, + null, + null ); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index ed27ed8f3..1842e3d40 100755 --- a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -61,6 +61,11 @@ public class VCFWriterStub implements Stub, VCFWriter { */ private final PrintStream genotypeStream; + /** + * Should we emit a compressed output stream? + */ + private final boolean isCompressed; + /** * Connects this stub with an external stream capable of serving the * requests of the consumer of this stub. @@ -71,22 +76,26 @@ public class VCFWriterStub implements Stub, VCFWriter { * Create a new stub given the requested file. * @param engine GATK engine. * @param genotypeFile file to (ultimately) create. + * @param isCompressed should we compress the output stream? 
*/ - public VCFWriterStub(GenomeAnalysisEngine engine,File genotypeFile) { + public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed) { this.engine = engine; this.genotypeFile = genotypeFile; this.genotypeStream = null; + this.isCompressed = isCompressed; } /** * Create a new stub given the requested file. * @param engine GATK engine. * @param genotypeStream stream to (ultimately) write. + * @param isCompressed should we compress the output stream? */ - public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream) { + public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed) { this.engine = engine; this.genotypeFile = null; this.genotypeStream = new PrintStream(genotypeStream); + this.isCompressed = isCompressed; } /** @@ -105,6 +114,14 @@ public class VCFWriterStub implements Stub, VCFWriter { return genotypeStream; } + /** + * Indicates whether the output stream should be compressed. + * @return true if the output should be compressed, false otherwise. + */ + public boolean isCompressed() { + return isCompressed; + } + /** * Retrieves the header to use when creating the new file. * @return header to use when creating the new file.