From 4e42988c6671c1800436a97773c58c0ebd55a251 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 16 Aug 2012 12:39:54 -0400 Subject: [PATCH] GSA-485: Remove repairVCFHeader from GATK codebase -- Removed half-a*ssed attempt to automatically repair VCF files with bad headers, which allowed users to provide a replacement header overwriting the file's actually header on the fly. Not a good idea, really. Eric has promised to create a utility that walks through a VCF file and creates a meaningful header field based on the file's contents (if this ever becomes a priority) --- .../sting/gatk/GenomeAnalysisEngine.java | 15 ++--------- .../arguments/GATKArgumentCollection.java | 9 ------- .../gatk/refdata/tracks/FeatureManager.java | 8 ++---- .../gatk/refdata/tracks/RMDTrackBuilder.java | 16 +---------- .../sting/utils/codecs/vcf/VCFCodec.java | 27 ------------------- .../variantcontext/writer/VCFWriter.java | 2 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 7 +---- 7 files changed, 7 insertions(+), 77 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 56fcf0652..55107833d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -849,20 +849,9 @@ public class GenomeAnalysisEngine { SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { - VCFHeader header = null; - if ( getArguments().repairVCFHeader != null ) { - try { - final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(getArguments().repairVCFHeader)); - header = (VCFHeader)new VCFCodec().readHeader(pbs).getHeaderValue(); - pbs.close(); - } catch ( IOException e ) { - throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e); - } - } + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType); - RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType); - - List dataSources = new ArrayList(); + final List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, builder, diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 4c9235b58..06177868a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -384,14 +384,5 @@ public class GATKArgumentCollection { @Hidden public boolean USE_SLOW_GENOTYPES = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - /** - * The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file - * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other - * VCF file that GATK reads in. This allows us to have in effect a master set of header records and use these - * to fill in any missing ones in input VCF files. - */ - @Argument(fullName="repairVCFHeader", shortName = "repairVCFHeader", doc="If provided, whenever we read a VCF file we will use the header in this file to repair the header of the input VCF files", required=false) - public File repairVCFHeader = null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index b5d5deedb..a2fe94641 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -85,18 +85,16 @@ public class FeatureManager { private final PluginManager pluginManager; private final Collection featureDescriptors = new TreeSet(); - private final VCFHeader headerForRepairs; private final boolean lenientVCFProcessing; /** * Construct a FeatureManager without a master VCF header */ public FeatureManager() { - this(null, false); + this(false); } - public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) { - this.headerForRepairs = headerForRepairs; + public FeatureManager(final boolean lenientVCFProcessing) { this.lenientVCFProcessing = lenientVCFProcessing; pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); @@ -255,8 +253,6 @@ public class FeatureManager { ((NameAwareCodec)codex).setName(name); if ( codex instanceof ReferenceDependentFeatureCodec ) ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - if ( codex instanceof VCFCodec ) - ((VCFCodec)codex).setHeaderForRepairs(headerForRepairs); if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing ) ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index e183fe169..81fe73075 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -89,17 +89,15 @@ public class RMDTrackBuilder { // extends PluginManager { * please talk through your approach with the SE team. * @param dict Sequence dictionary to use. * @param genomeLocParser Location parser to use. - * @param headerForRepairs a VCF header that should be used to repair VCF headers. Can be null * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. */ public RMDTrackBuilder(final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, - final VCFHeader headerForRepairs, ValidationExclusion.TYPE validationExclusionType) { this.dict = dict; this.validationExclusionType = validationExclusionType; this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); + this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); } /** @@ -111,18 +109,6 @@ public class RMDTrackBuilder { // extends PluginManager { return featureManager; } - /** - * Same as full constructor but makes one without a header for repairs - * @param dict - * @param genomeLocParser - * @param validationExclusionType - */ - public RMDTrackBuilder(final SAMSequenceDictionary dict, - final GenomeLocParser genomeLocParser, - ValidationExclusion.TYPE validationExclusionType) { - this(dict, genomeLocParser, null, validationExclusionType); - } - /** * create a RMDTrack of the specified type * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index da5b18831..4df1efee7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -49,13 +49,6 @@ public class VCFCodec extends AbstractVCFCodec { // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; - /** - * A VCF header the contains master info/filter/format records that we use to 'fill in' - * any missing records from our input VCF header. This allows us to repair headers on - * the fly - */ - private VCFHeader headerForRepairs = null; - /** * @param reader the line reader to take header lines from * @return the number of header lines @@ -88,8 +81,6 @@ public class VCFCodec extends AbstractVCFCodec { } headerStrings.add(line); super.parseHeaderFromLines(headerStrings, version); - if ( headerForRepairs != null ) - this.header = repairHeader(this.header, headerForRepairs); return this.header; } else { @@ -103,24 +94,6 @@ public class VCFCodec extends AbstractVCFCodec { throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); } - private final VCFHeader repairHeader(final VCFHeader readHeader, final VCFHeader masterHeader) { - final Set lines = VCFUtils.smartMergeHeaders(Arrays.asList(readHeader, masterHeader), log); - return new VCFHeader(lines, readHeader.getGenotypeSamples()); - } - - /** - * Tells this VCFCodec to repair the incoming header files with the information in masterHeader - * - * @param headerForRepairs - */ - public void setHeaderForRepairs(final VCFHeader headerForRepairs) { - if ( headerForRepairs != null ) - log.info("Using master VCF header to repair missing files from incoming VCFs"); - this.headerForRepairs = headerForRepairs; - } - - - /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index ea968e153..db74f2263 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -563,6 +563,6 @@ class VCFWriter extends IndexingVariantContextWriter { + " at " + vc.getChr() + ":" + vc.getStart() + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + " complete VCF headers by default. This error can be disabled with the engine argument" - + " -U LENIENT_VCF_PROCESSING or repair the VCF file header using repairVCFHeader"); + + " -U LENIENT_VCF_PROCESSING"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 71fc1d464..b2a4ac2da 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -92,7 +92,7 @@ public class VCFIntegrationTest extends WalkerTest { // // - // Tests to ensure that -U LENIENT_VCF_PROCESS and header repairs are working + // Tests to ensure that -U LENIENT_VCF_PROCESS // // @@ -106,11 +106,6 @@ public class VCFIntegrationTest extends WalkerTest { runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true); } - @Test - public void testPassingOnVCFWithoutHeadersRepairingHeaders() { - runVCFWithoutHeaders("-repairVCFHeader " + privateTestDir + "vcfexample2.justHeader.vcf", "ff61e9cad6653c7f93d82d391f7ecdcb", null, false); - } - private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) { final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf"; final String baseCommand = "-R " + b37KGReference