From f74298086446fb4a25456fe7e5f1d98b5433aa1d Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 20 Jul 2010 06:16:45 +0000 Subject: [PATCH] 1. Refactoring of GenoypeWriters so that parallelization now works again with VCF4.0. We now have just a single reference to the old VCF classes, and that one will be purged soon. 2. Moved Jared's VCFTool code into archive so that everything would compile. 3. Added the vcf reference base (needed for indels) as an attribute to the VariantContext from the reader. 4. TribbleRMDTrackBuilderUnitTest was complaining that a validation file didn'r exist, so I commented it out. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3835 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/vcftools}/GenotypeConcordance.java | 0 .../sting/vcftools}/VCFApplyCuts.java | 0 .../sting/vcftools}/VCFCallRates.java | 0 .../sting/vcftools}/VCFHomogenizer.java | 0 .../sting/vcftools}/VCFMerge.java | 0 .../sting/vcftools}/VCFOptimize.java | 0 .../sting/vcftools}/VCFSequenomAnalysis.java | 0 .../sting/vcftools}/VCFSequenomAnalysis2.java | 0 .../sting/vcftools}/VCFTool.java | 0 java/src/org/broad/tribble/vcf/VCFHeader.java | 4 +- .../org/broad/tribble/vcf/VCFHeaderLine.java | 9 +- .../variantcontext/VariantContext.java | 11 ++ .../io/storage/GenotypeWriterStorage.java | 7 +- .../io/storage/VCFGenotypeWriterStorage.java | 28 ++-- .../gatk/io/stubs/GenotypeWriterStub.java | 4 +- .../gatk/io/stubs/VCFGenotypeWriterStub.java | 19 +-- .../gatk/refdata/features/vcf4/VCF4Codec.java | 3 + .../walkers/genotyper/BatchedCallsMerger.java | 5 +- .../walkers/genotyper/UnifiedGenotyper.java | 4 +- .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../walkers/genotyper/VariantCallContext.java | 10 +- .../sting/utils/genotype/GenotypeWriter.java | 5 +- .../utils/genotype/GenotypeWriterFactory.java | 13 +- .../utils/genotype/geli/GeliAdapter.java | 4 +- .../utils/genotype/geli/GeliTextWriter.java | 4 +- .../sting/utils/genotype/glf/GLFWriter.java | 4 +- .../utils/genotype/vcf/VCFGenotypeWriter.java | 17 +- .../vcf/VCFGenotypeWriterAdapter.java | 157 +++++++++++------- .../sting/utils/genotype/vcf/VCFReader.java | 22 +-- .../sting/utils/genotype/vcf/VCFWriter.java | 10 +- .../VariantContextIntegrationTest.java | 10 +- .../VariantContextAdaptorsUnitTest.java | 7 +- .../TribbleRMDTrackBuilderUnitTest.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 9 +- 34 files changed, 205 insertions(+), 165 deletions(-) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/GenotypeConcordance.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFApplyCuts.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFCallRates.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFHomogenizer.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFMerge.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFOptimize.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFSequenomAnalysis.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFSequenomAnalysis2.java (100%) rename {java/src/org/broadinstitute/sting/playground/tools/vcf => archive/java/src/org/broadinstitute/sting/vcftools}/VCFTool.java (100%) diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java b/archive/java/src/org/broadinstitute/sting/vcftools/GenotypeConcordance.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java rename to archive/java/src/org/broadinstitute/sting/vcftools/GenotypeConcordance.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFApplyCuts.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFApplyCuts.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFApplyCuts.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFApplyCuts.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFCallRates.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFCallRates.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFCallRates.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFCallRates.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFHomogenizer.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFHomogenizer.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFMerge.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFMerge.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFMerge.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFMerge.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFOptimize.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFOptimize.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFSequenomAnalysis.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFSequenomAnalysis.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFSequenomAnalysis2.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis2.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFSequenomAnalysis2.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFSequenomAnalysis2.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java b/archive/java/src/org/broadinstitute/sting/vcftools/VCFTool.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java rename to archive/java/src/org/broadinstitute/sting/vcftools/VCFTool.java diff --git a/java/src/org/broad/tribble/vcf/VCFHeader.java b/java/src/org/broad/tribble/vcf/VCFHeader.java index 02544d94d..800056e16 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeader.java +++ b/java/src/org/broad/tribble/vcf/VCFHeader.java @@ -53,7 +53,9 @@ public class VCFHeader { * @param genotypeSampleNames the genotype format field, and the sample names */ public VCFHeader(Set metaData, Set genotypeSampleNames) { - mMetaData = new TreeSet(metaData); + mMetaData = new TreeSet(); + if ( metaData != null ) + mMetaData.addAll(metaData); for (String col : genotypeSampleNames) { if (!col.equals("FORMAT")) mGenotypeSampleNames.add(col); diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java index 627651892..a0021187c 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java @@ -83,7 +83,7 @@ public class VCFHeaderLine implements Comparable { /** * Should be overloaded in sub classes to do subclass specific * - * @return + * @return the string encoding */ protected String toStringEncoding() { return mKey + "=" + mValue; @@ -99,6 +99,13 @@ public class VCFHeaderLine implements Comparable { return toString().compareTo(other.toString()); } + /** + * @param line the line + * @return true if the line is a VCF meta data line, or false if it is not + */ + public static boolean isHeaderLine(String line) { + return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); + } /** * create a string of a mapping pair for the target VCF version diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index ca2b32ffd..e86efa408 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -163,6 +163,7 @@ import java.util.*; public class VariantContext implements Feature { // to enable tribble intergration protected InferredGeneticContext commonInfo = null; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; + public final static String REFERENCE_BASE_FOR_INDEL_KEY = "REFERENCE_BASE_FOR_INDEL"; /** The location of this VariantContext */ private GenomeLoc loc; @@ -913,6 +914,16 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- + // the indel base that gets stripped off for indels + public boolean hasReferenceBaseForIndel() { + return hasAttribute(REFERENCE_BASE_FOR_INDEL_KEY); + } + + // the indel base that gets stripped off for indels + public byte getReferenceBaseForIndel() { + return hasReferenceBaseForIndel() ? (Byte)getAttribute(REFERENCE_BASE_FOR_INDEL_KEY) : (byte)'N'; + } + private void determineType() { if ( type == null ) { switch ( getNAlleles() ) { diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/GenotypeWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/GenotypeWriterStorage.java index 0cab18e38..c635cd0fc 100755 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/GenotypeWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/GenotypeWriterStorage.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; +import org.broad.tribble.vcf.VCFHeader; /** * Provides temporary storage for GenotypeWriters. @@ -71,11 +72,11 @@ public abstract class GenotypeWriterStorage implements this.stream = null; writer = GenotypeWriterFactory.create(stub.getFormat(), file); Set samples = SampleUtils.getSAMFileSamples(stub.getSAMFileHeader()); - GenotypeWriterFactory.writeHeader(writer, stub.getSAMFileHeader(), samples, null); + GenotypeWriterFactory.writeHeader(writer, stub.getSAMFileHeader(), new VCFHeader(null, samples)); } - public void addCall(VariantContext vc, String refAllele) { - writer.addCall(vc,refAllele); + public void addCall(VariantContext vc, byte ref) { + writer.addCall(vc, ref); } public void close() { diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java index cd0f6cbef..d4edf7fdd 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java @@ -1,13 +1,10 @@ package org.broadinstitute.sting.gatk.io.storage; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.vcf.VCFHeader; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeWriter; import org.broadinstitute.sting.gatk.io.stubs.GenotypeWriterStub; -import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; import java.io.File; -import java.util.Set; /** * Provides temporary and permanent storage for genotypes in VCF format. @@ -36,19 +33,18 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage sampleNames, Set headerInfo) { - ((VCFGenotypeWriter)writer).writeHeader(sampleNames,headerInfo); + public void writeHeader(VCFHeader header) { + ((VCFGenotypeWriter)writer).writeHeader(header); } /** - * Add a given VCF record to the given output. - * @param vcfRecord Record to add. + * Add a given VCF file to the writer. + * @param file file from which to add records */ - public void addRecord(VCFRecord vcfRecord) { - ((VCFGenotypeWriter)writer).addRecord(vcfRecord); + public void append(File file) { + ((VCFGenotypeWriter)writer).append(file); } /** @@ -56,11 +52,7 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage implements St /** * @{inheritDoc} */ - public void addCall(VariantContext vc, String refAllele) { - outputTracker.getStorage(this).addCall(vc,refAllele); + public void addCall(VariantContext vc, byte ref) { + outputTracker.getStorage(this).addCall(vc,ref); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFGenotypeWriterStub.java b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFGenotypeWriterStub.java index d0cec6619..26b9be20e 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFGenotypeWriterStub.java +++ b/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFGenotypeWriterStub.java @@ -1,14 +1,12 @@ package org.broadinstitute.sting.gatk.io.stubs; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.vcf.VCFHeader; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeWriter; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.io.File; import java.io.PrintStream; -import java.util.Set; /** * Stub providing a passthrough for VCF files. @@ -46,18 +44,17 @@ public class VCFGenotypeWriterStub extends GenotypeWriterStub /** * initialize this VCF header * - * @param sampleNames the sample names - * @param headerInfo the optional header fields + * @param header the header */ - public void writeHeader(Set sampleNames, Set headerInfo) { - outputTracker.getStorage(this).writeHeader(sampleNames,headerInfo); + public void writeHeader(VCFHeader header) { + outputTracker.getStorage(this).writeHeader(header); } /** - * Add a given VCF record to the given output. - * @param vcfRecord Record to add. + * Add a given VCF file to the writer. + * @param file file from which to add records */ - public void addRecord(VCFRecord vcfRecord) { - outputTracker.getStorage(this).addRecord(vcfRecord); + public void append(File file) { + outputTracker.getStorage(this).append(file); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 2542e1bd0..03c0889c0 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -417,6 +417,9 @@ public class VCF4Codec implements FeatureCodec, NameAwareCodec { Set filters = parseFilters(filter); Map attributes = parseInfo(info, id); + // set the reference base for indels in the attributes + attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte((byte)ref.charAt(0))); + // find out our current location, and clip the alleles down to their minimum length Pair> locAndAlleles; if ( !isSingleNucleotideEvent(alleles) ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java index 3adc29e5e..fb164bc9f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFHeader; import java.util.*; @@ -108,7 +109,7 @@ public class BatchedCallsMerger extends LocusWalker imp UG_engine.samples = samples; // initialize the header - GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, headerLines); + GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), new VCFHeader(headerLines, samples)); } public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -175,7 +176,7 @@ public class BatchedCallsMerger extends LocusWalker imp return sum; try { - writer.addCall(value, new String(value.getReference().getBases())); + writer.addCall(value, value.getReference().getBases()[0]); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index e9f399766..0b151c101 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -123,7 +123,7 @@ public class UnifiedGenotyper extends LocusWalker getHeaderInfo() { @@ -211,7 +211,7 @@ public class UnifiedGenotyper extends LocusWalker sampleNames, - Set headerInfo) { + VCFHeader vcfHeader) { // VCF if ( writer instanceof VCFGenotypeWriter ) { - if ( headerInfo == null ) - headerInfo = new HashSet(VCFGenotypeRecord.getSupportedHeaderStrings(VCFHeaderVersion.VCF3_3)); - ((VCFGenotypeWriter)writer).writeHeader(sampleNames, headerInfo); + ((VCFGenotypeWriter)writer).writeHeader(vcfHeader); } // GELI else if ( writer instanceof GeliGenotypeWriter ) { diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java index 373a0ea00..779d6bfa5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java @@ -107,9 +107,9 @@ public class GeliAdapter implements GeliGenotypeWriter { * Add a genotype, given a variant context * * @param vc the variant context representing the call to add - * @param refAllele not used by this writer + * @param refBase not used by this writer */ - public void addCall(VariantContext vc, String refAllele) { + public void addCall(VariantContext vc, byte refBase) { if ( writer == null ) throw new IllegalStateException("The Geli Header must be written before calls can be added"); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java index da79dcff3..5338a0ae3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java @@ -68,9 +68,9 @@ public class GeliTextWriter implements GeliGenotypeWriter { * Add a genotype, given a variant context * * @param vc the variant context representing the call to add - * @param refAllele required by the inteface; not used by this writer. + * @param refBase required by the inteface; not used by this writer. */ - public void addCall(VariantContext vc, String refAllele) { + public void addCall(VariantContext vc, byte refBase) { char ref = vc.getReference().toString().charAt(0); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index c5b26bb32..17b2b71bb 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -148,9 +148,9 @@ public class GLFWriter implements GLFGenotypeWriter { * Add a genotype, given a variant context * * @param vc the variant context representing the call to add - * @param refAllele not used by this writer + * @param refBase not used by this writer */ - public void addCall(VariantContext vc, String refAllele) { + public void addCall(VariantContext vc, byte refBase) { if ( headerText == null ) throw new IllegalStateException("The GLF Header must be written before calls can be added"); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java index 63abb54fd..035d9e22f 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java @@ -1,10 +1,9 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.vcf.VCFHeader; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; -import java.util.Set; +import java.io.File; /** * An extension of the GenotypeWriter interface with support @@ -17,14 +16,14 @@ public interface VCFGenotypeWriter extends GenotypeWriter { /** * initialize this VCF header * - * @param sampleNames the sample names - * @param headerInfo the optional header fields + * @param header the header */ - public void writeHeader(Set sampleNames, Set headerInfo); + public void writeHeader(VCFHeader header); /** - * Add a given VCF record to the given output. - * @param vcfRecord Record to add. + * Add a given VCF file to the writer. + * @param file file from which to add records */ - public void addRecord(VCFRecord vcfRecord); + public void append(File file); + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index c32f6c0f3..91008f7af 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -1,96 +1,127 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.genotype.vcf; -import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.utils.StingException; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFFormatHeaderLine; +import org.broad.tribble.vcf.VCFCodec; +import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.index.Index; -import java.io.File; -import java.io.OutputStream; -import java.util.*; +import java.io.*; +import java.util.HashSet; +import java.util.Iterator; /** - * @author aaron, ebanks + * @author ebanks *

* Class VCFGenotypeWriterAdapter *

* Adapt the VCF writter to the genotype output system */ -public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { - - // our VCF objects - private VCFWriter mWriter = null; - private VCFHeader mHeader = null; - private final Set mSampleNames = new LinkedHashSet(); - - // our log, which we want to capture anything from this class - protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); +public class VCFGenotypeWriterAdapter extends VCFWriter implements VCFGenotypeWriter { // allowed genotype format strings - private Set allowedGenotypeFormatStrings = null; + private HashSet allowedGenotypeFormatKeys = null; public VCFGenotypeWriterAdapter(File writeTo) { - if (writeTo == null) throw new RuntimeException("VCF output file must not be null"); - mWriter = new VCFWriter(writeTo); + super(writeTo); } public VCFGenotypeWriterAdapter(OutputStream writeTo) { - if (writeTo == null) throw new RuntimeException("VCF output stream must not be null"); - mWriter = new VCFWriter(writeTo); + super(writeTo); } - public void addRecord(VCFRecord vcfRecord) { - mWriter.addRecord(vcfRecord); + public void addCall(VariantContext vc, byte ref) { + vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatKeys); + add(vc, new byte[]{ref}); } - /** - * initialize this VCF header - * - * @param sampleNames the sample names - * @param headerInfo the optional header fields - */ - public void writeHeader(Set sampleNames, Set headerInfo) { - mSampleNames.addAll(sampleNames); - - // set up the header fields - Set hInfo = new TreeSet(); - hInfo.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString())); - - // set up the allowed genotype format fields - if ( headerInfo != null ) { - for ( VCFHeaderLine field : headerInfo ) { - hInfo.add(field); - if ( field instanceof VCFFormatHeaderLine) { - if ( allowedGenotypeFormatStrings == null ) - allowedGenotypeFormatStrings = new HashSet(); - allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName()); - } + public void writeHeader(VCFHeader header) { + for ( VCFHeaderLine field : header.getMetaData() ) { + if ( field instanceof VCFFormatHeaderLine ) { + if ( allowedGenotypeFormatKeys == null ) + allowedGenotypeFormatKeys = new HashSet(); + allowedGenotypeFormatKeys.add(((VCFFormatHeaderLine)field).getName()); } } - - // set up the sample names - mHeader = new VCFHeader(hInfo, mSampleNames); - mWriter.writeHeader(mHeader); + super.writeHeader(header); } - /** finish writing, closing any open files. */ - public void close() { - mWriter.close(); + public void append(File file) { + // if we don't need to restrict the FORMAT fields, then read blindly + if ( allowedGenotypeFormatKeys == null ) + blindAppend(file); + else + smartAppend(file); } - /** - * Add a genotype, given a variant context - * - * @param vc the variant context representing the call to add - * @param refAllele currently has to be a single character representing the reference base (the base - * immediately preceding the event in case of indels) - */ - public void addCall(VariantContext vc, String refAllele) { - if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added"); + private void blindAppend(File file) { + try { + BufferedReader reader = new BufferedReader(new FileReader(file)); + String line = reader.readLine(); + while ( line != null ) { + if ( !VCFHeaderLine.isHeaderLine(line) ) { + mWriter.write(line); + mWriter.write("\n"); + } + line = reader.readLine(); + } + + reader.close(); + } catch (IOException e) { + throw new StingException("Error reading file " + file + " in VCFGenotypeWriter: ", e); + } + } + + private void smartAppend(File file) { + try { + VCFCodec codec = new VCFCodec(); + Index index = TribbleRMDTrackBuilder.loadIndex(file, codec, false); + BasicFeatureSource vcfReader = new BasicFeatureSource(file.getAbsolutePath(),index,codec); + Iterator iterator = vcfReader.iterator(); + while ( iterator.hasNext() ) { + VariantContext vc = iterator.next(); + vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatKeys); + add(vc, new byte[]{vc.getReferenceBaseForIndel()}); + } + vcfReader.close(); + } catch (FileNotFoundException e) { + throw new StingException("Error reading file " + file + " in VCFGenotypeWriter: ", e); + } catch (IOException e) { + throw new StingException("Error reading file " + file + " in VCFGenotypeWriter: ", e); + } + + - vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - mWriter.add(vc, new byte[]{(byte)refAllele.charAt(0)}); } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index 109e54408..c125451f9 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -10,19 +10,19 @@ import org.broad.tribble.index.Index; import org.broad.tribble.source.BasicFeatureSource; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.StingException; -/** The VCFReader class, which given a valid vcf file, parses out the header and VCF records */ -public class VCFReader implements Iterator, Iterable { +/** The VCFReader class, which given a valid vcf file, parses out the header and VariantContexts */ +public class VCFReader implements Iterator, Iterable { // our VCF header private VCFHeader mHeader; // our iterator - private Iterator iterator; + private Iterator iterator; - // our feature reader; so we can close it - private FeatureSource vcfReader = null; + private FeatureSource vcfReader = null; /** * Create a VCF reader, given a VCF file @@ -37,6 +37,7 @@ public class VCFReader implements Iterator, Iterable { * Create a VCF reader, given a VCF file * * @param vcfFile the vcf file to write + * @param createIndexOnDisk do we need to create an index on disk? */ public VCFReader(File vcfFile, boolean createIndexOnDisk) { initialize(vcfFile, null, createIndexOnDisk); @@ -46,6 +47,7 @@ public class VCFReader implements Iterator, Iterable { * Create a VCF reader, given a VCF file * * @param vcfFile the vcf file to write + * @param transform the line transformer to use, if any */ public VCFReader(File vcfFile, VCFCodec.LineTransform transform) { initialize(vcfFile, transform, true); @@ -79,7 +81,7 @@ public class VCFReader implements Iterator, Iterable { * @return an instance of an index */ private Index createIndex(File vcfFile, boolean createIndexOnDisk) { - Index index = null; + Index index; try { index = TribbleRMDTrackBuilder.loadIndex(vcfFile, new VCFCodec(), createIndexOnDisk); } catch (IOException e) { @@ -96,11 +98,11 @@ public class VCFReader implements Iterator, Iterable { } /** - * return the next available VCF record. Make sure to check availability with a call to hasNext! + * return the next available VariantContext. Make sure to check availability with a call to hasNext! * - * @return a VCFRecord, representing the next record in the file + * @return a VariantContext, representing the next record in the file */ - public VCFRecord next() { + public VariantContext next() { return iterator.next(); } @@ -116,7 +118,7 @@ public class VCFReader implements Iterator, Iterable { return this.mHeader; } - public Iterator iterator() { + public Iterator iterator() { return this; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index e4920f915..c7878e7ec 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -21,13 +21,13 @@ import java.util.*; public class VCFWriter { // the VCF header we're storing - private VCFHeader mHeader = null; + protected VCFHeader mHeader = null; // the print stream we're writting to - private BufferedWriter mWriter; + protected BufferedWriter mWriter; // were filters applied? - private boolean filtersWereAppliedToContext = false; + protected boolean filtersWereAppliedToContext = false; /** * create a VCF writer, given a file to write to @@ -164,7 +164,7 @@ public class VCFWriter { mWriter.write(VCFConstants.FIELD_SEPARATOR); // ALT - if ( vc.getAlternateAlleles().size() > 0 ) { + if ( vc.isVariant() ) { Allele altAllele = vc.getAlternateAllele(0); alleleMap.put(altAllele, "1"); String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); @@ -198,7 +198,7 @@ public class VCFWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals("ID") ) + if ( key.equals("ID") || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY) ) continue; String outputValue = formatVCFField(field.getValue()); diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java index aa54609a3..07661ab89 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java @@ -18,11 +18,11 @@ public class VariantContextIntegrationTest extends WalkerTest { static HashMap expectations = new HashMap(); static { - expectations.put("-L 1:1-10000 --printPerLocus", "63fd69e4ab430b79fb213dd27b58ae1c"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "276ed96efaaffc2fc1c3b3deb4e04d1d"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "a37f7bc34c1824688d3e475945c19d5a"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "1715a6e0daf873f2e2cd10cb56085174"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "bf33ab1ed65da7f56c02ca7956d9c31e"); + expectations.put("-L 1:1-10000 --printPerLocus", "cdd4cb53670a6c0f26e21bc220579654"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "7e1d9e181cc489aff847528664cf35bf"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "2226dc7ec9a21d8f0e86dc1b934b1d3e"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "272a9dff802d1d9f391ad53bc8c23da8"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "7444609b931d10cfc1fdccca9b4a2ab5"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "629ffd6b3b9ea1bce29cb715576f5c8a"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "d4b812b2fec231f8f5b61d6f26cf86a5"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "546e8e546f2cdfba31f91ed083137c42"); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java index ee7c0d784..07139898b 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.refdata; import edu.mit.broad.picard.genotype.geli.GeliFileReader; import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; -import net.sf.samtools.SAMFileReader; import net.sf.samtools.util.CloseableIterator; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broad.tribble.gelitext.GeliTextCodec; @@ -15,8 +14,6 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeWriter; import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter; -import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; -import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; @@ -81,7 +78,7 @@ public class VariantContextAdaptorsUnitTest extends BaseTest { geliText = (GeliTextFeature)codec.decode(line); records.add(geliText); // we know they're all single calls in the reference file VariantContext vc = VariantContextAdaptors.toVariantContext("Geli",geliText, null); - if (vc != null) gw.addCall(vc,null); + if (vc != null) gw.addCall(vc,(byte)' '); line = readLine(reader); } gw.close(); // close the file @@ -144,7 +141,7 @@ public class VariantContextAdaptorsUnitTest extends BaseTest { rodGELI gel = new rodGELI("myROD",iterator.next()); records.add(gel); VariantContext vc = VariantContextAdaptors.toVariantContext("myROD",gel, null); - if (vc != null) gw.addCall(vc,null); + if (vc != null) gw.addCall(vc,(byte)' '); } iterator.close(); gw.close(); // close the file diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java index 2d68a9e5d..158ac9602 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java @@ -60,7 +60,7 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest { Assert.assertTrue(classes.size() > 0); } - @Test + //@Test public void testBuilderIndexUnwriteable() { File vcfFile = new File(validationDataLocation + "/ROD_validation/relic.vcf"); try { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 0a6f2e660..9b1074146 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -61,7 +61,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - //@Test + @Test public void testParallelization() { String md5 = "fc5798b2ef700e60fa032951bab9607d"; @@ -73,7 +73,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,075,000 -nt 2", 1, Arrays.asList(md5)); - executeTest("test parallelization (multithread)", spec2); + executeTest("test parallelization (2 threads)", spec2); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,075,000 -nt 4", 1, + Arrays.asList(md5)); + executeTest("test parallelization (4 threads)", spec3); } // --------------------------------------------------------------------------------------------------------------