GSA-485: Remove repairVCFHeader from GATK codebase

-- Removed half-a*ssed attempt to automatically repair VCF files with bad headers, which allowed users to provide a replacement header overwriting the file's actual header on the fly.  Not a good idea, really.  Eric has promised to create a utility that walks through a VCF file and creates a meaningful header field based on the file's contents (if this ever becomes a priority)
This commit is contained in:
Mark DePristo 2012-08-16 12:39:54 -04:00
parent 52bfe8db8a
commit 4e42988c66
7 changed files with 7 additions and 77 deletions

View File

@ -849,20 +849,9 @@ public class GenomeAnalysisEngine {
SAMSequenceDictionary sequenceDictionary,
GenomeLocParser genomeLocParser,
ValidationExclusion.TYPE validationExclusionType) {
VCFHeader header = null;
if ( getArguments().repairVCFHeader != null ) {
try {
final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(getArguments().repairVCFHeader));
header = (VCFHeader)new VCFCodec().readHeader(pbs).getHeaderValue();
pbs.close();
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e);
}
}
final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType);
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType);
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
final List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
dataSources.add(new ReferenceOrderedDataSource(fileDescriptor,
builder,

View File

@ -384,14 +384,5 @@ public class GATKArgumentCollection {
@Hidden
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other
* VCF file that GATK reads in. This allows us to have in effect a master set of header records and use these
* to fill in any missing ones in input VCF files.
*/
@Argument(fullName="repairVCFHeader", shortName = "repairVCFHeader", doc="If provided, whenever we read a VCF file we will use the header in this file to repair the header of the input VCF files", required=false)
public File repairVCFHeader = null;
}

View File

@ -85,18 +85,16 @@ public class FeatureManager {
private final PluginManager<FeatureCodec> pluginManager;
private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
private final VCFHeader headerForRepairs;
private final boolean lenientVCFProcessing;
/**
* Construct a FeatureManager without a master VCF header
*/
public FeatureManager() {
this(null, false);
this(false);
}
public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) {
this.headerForRepairs = headerForRepairs;
public FeatureManager(final boolean lenientVCFProcessing) {
this.lenientVCFProcessing = lenientVCFProcessing;
pluginManager = new PluginManager<FeatureCodec>(FeatureCodec.class, "Codecs", "Codec");
@ -255,8 +253,6 @@ public class FeatureManager {
((NameAwareCodec)codex).setName(name);
if ( codex instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser);
if ( codex instanceof VCFCodec )
((VCFCodec)codex).setHeaderForRepairs(headerForRepairs);
if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing )
((AbstractVCFCodec)codex).disableOnTheFlyModifications();

View File

@ -89,17 +89,15 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
* please talk through your approach with the SE team.
* @param dict Sequence dictionary to use.
* @param genomeLocParser Location parser to use.
* @param headerForRepairs a VCF header that should be used to repair VCF headers. Can be null
* @param validationExclusionType Types of validations to exclude, for sequence dictionary verification.
*/
public RMDTrackBuilder(final SAMSequenceDictionary dict,
final GenomeLocParser genomeLocParser,
final VCFHeader headerForRepairs,
ValidationExclusion.TYPE validationExclusionType) {
this.dict = dict;
this.validationExclusionType = validationExclusionType;
this.genomeLocParser = genomeLocParser;
this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
}
/**
@ -111,18 +109,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
return featureManager;
}
/**
* Same as full constructor but makes one without a header for repairs
* @param dict
* @param genomeLocParser
* @param validationExclusionType
*/
public RMDTrackBuilder(final SAMSequenceDictionary dict,
final GenomeLocParser genomeLocParser,
ValidationExclusion.TYPE validationExclusionType) {
this(dict, genomeLocParser, null, validationExclusionType);
}
/**
* create a RMDTrack of the specified type
*

View File

@ -49,13 +49,6 @@ public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
/**
* A VCF header the contains master info/filter/format records that we use to 'fill in'
* any missing records from our input VCF header. This allows us to repair headers on
* the fly
*/
private VCFHeader headerForRepairs = null;
/**
* @param reader the line reader to take header lines from
* @return the number of header lines
@ -88,8 +81,6 @@ public class VCFCodec extends AbstractVCFCodec {
}
headerStrings.add(line);
super.parseHeaderFromLines(headerStrings, version);
if ( headerForRepairs != null )
this.header = repairHeader(this.header, headerForRepairs);
return this.header;
}
else {
@ -103,24 +94,6 @@ public class VCFCodec extends AbstractVCFCodec {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
private final VCFHeader repairHeader(final VCFHeader readHeader, final VCFHeader masterHeader) {
final Set<VCFHeaderLine> lines = VCFUtils.smartMergeHeaders(Arrays.asList(readHeader, masterHeader), log);
return new VCFHeader(lines, readHeader.getGenotypeSamples());
}
/**
* Tells this VCFCodec to repair the incoming header files with the information in masterHeader
*
* @param headerForRepairs
*/
public void setHeaderForRepairs(final VCFHeader headerForRepairs) {
if ( headerForRepairs != null )
log.info("Using master VCF header to repair missing files from incoming VCFs");
this.headerForRepairs = headerForRepairs;
}
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
*

View File

@ -563,6 +563,6 @@ class VCFWriter extends IndexingVariantContextWriter {
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have"
+ " complete VCF headers by default. This error can be disabled with the engine argument"
+ " -U LENIENT_VCF_PROCESSING or repair the VCF file header using repairVCFHeader");
+ " -U LENIENT_VCF_PROCESSING");
}
}

View File

@ -92,7 +92,7 @@ public class VCFIntegrationTest extends WalkerTest {
//
//
// Tests to ensure that -U LENIENT_VCF_PROCESS and header repairs are working
// Tests to ensure that -U LENIENT_VCF_PROCESS
//
//
@ -106,11 +106,6 @@ public class VCFIntegrationTest extends WalkerTest {
runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true);
}
@Test
public void testPassingOnVCFWithoutHeadersRepairingHeaders() {
runVCFWithoutHeaders("-repairVCFHeader " + privateTestDir + "vcfexample2.justHeader.vcf", "ff61e9cad6653c7f93d82d391f7ecdcb", null, false);
}
private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) {
final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf";
final String baseCommand = "-R " + b37KGReference