BCF2 cleanup

-- allowMissingVCFHeaders is now part of the -U argument. If you specifically want unsafe VCF processing, you need -U LENIENT_VCF_PROCESSING. Updated lots of files to use this
-- LENIENT_VCF_PROCESSING disables on-the-fly VCF header cleanup. This is now implemented via a member variable, not a static class variable; I believe the static variable was changing GATK behavior during integration tests — causing some files to fail that pass when run as a single test — because the header-reading behavior depended on previous failures.
This commit is contained in:
Mark DePristo 2012-06-25 10:27:37 -04:00
parent 0b5980d7b3
commit c1ac0e2760
18 changed files with 59 additions and 39 deletions

View File

@ -830,7 +830,8 @@ public class GenomeAnalysisEngine {
throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e);
}
}
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser,header,validationExclusionType);
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType);
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
@ -854,6 +855,15 @@ public class GenomeAnalysisEngine {
return readsDataSource.getHeader();
}
public boolean lenientVCFProcessing() {
return lenientVCFProcessing(argCollection.unsafe);
}
public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) {
return val == ValidationExclusion.TYPE.ALL
|| val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING;
}
/**
* Returns the unmerged SAM file header for an individual reader.
* @param reader The reader.

View File

@ -347,9 +347,6 @@ public class GATKArgumentCollection {
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false)
public boolean allowMissingVCFHeaders = false;
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -40,6 +40,7 @@ public class ValidationExclusion {
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
ALL // do not check for all of the above conditions, DEFAULT
}

View File

@ -183,7 +183,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
List<Options> options = new ArrayList<Options>();
if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES);
if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -85,16 +86,18 @@ public class FeatureManager {
private final PluginManager<FeatureCodec> pluginManager;
private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
private final VCFHeader headerForRepairs;
private final boolean lenientVCFProcessing;
/**
* Construct a FeatureManager without a master VCF header
*/
public FeatureManager() {
this(null);
this(null, false);
}
public FeatureManager(final VCFHeader headerForRepairs) {
public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) {
this.headerForRepairs = headerForRepairs;
this.lenientVCFProcessing = lenientVCFProcessing;
pluginManager = new PluginManager<FeatureCodec>(FeatureCodec.class, "Codecs", "Codec");
for (final String rawName: pluginManager.getPluginsByName().keySet()) {
@ -252,8 +255,11 @@ public class FeatureManager {
((NameAwareCodec)codex).setName(name);
if ( codex instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser);
if ( codex instanceof VCFCodec)
if ( codex instanceof VCFCodec )
((VCFCodec)codex).setHeaderForRepairs(headerForRepairs);
if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing )
((AbstractVCFCodec)codex).disableOnTheFlyModifications();
return codex;
}
}

View File

@ -34,6 +34,7 @@ import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
@ -98,7 +99,7 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
this.dict = dict;
this.validationExclusionType = validationExclusionType;
this.genomeLocParser = genomeLocParser;
this.featureManager = new FeatureManager(headerForRepairs);
this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
}
/**

View File

@ -64,9 +64,10 @@ public class VCFDiffableReader implements DiffableReader {
root.add("VERSION", version);
br.close();
// must be read as state is stored in reader itself
AbstractVCFCodec.disableOnTheFlyModifications();
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
final VCFCodec vcfCodec = new VCFCodec();
vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false);
VCFHeader header = (VCFHeader)reader.getHeader();
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
String key = headerLine.getKey();

View File

@ -510,7 +510,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for (VariantContext vc : vcs) {
// an option for performance testing only
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );
// an option for performance testing only
if ( forceGenotypesDecode ) {

View File

@ -22,7 +22,6 @@ import java.util.zip.GZIPInputStream;
public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec {
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20);
protected static boolean doOnTheFlyModifications = true;
protected final static Logger log = Logger.getLogger(AbstractVCFCodec.class);
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
@ -61,6 +60,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected boolean warnedAboutNoEqualsForNonFlag = false;
/**
* If true, then we'll magically fix up VCF headers on the fly when we read them in
*/
protected boolean doOnTheFlyModifications = true;
protected AbstractVCFCodec() {
super(VariantContext.class);
}
@ -850,7 +854,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
* of VCF records. Useful primarily for raw comparisons such as when comparing
* raw VCF records
*/
public static final void disableOnTheFlyModifications() {
public final void disableOnTheFlyModifications() {
doOnTheFlyModifications = false;
}
}

View File

@ -1337,13 +1337,13 @@ public class VariantContext implements Feature { // to enable tribble integratio
* @param header containing types about all fields in this VC
* @return a fully decoded version of this VC
*/
public VariantContext fullyDecode(final VCFHeader header) {
public VariantContext fullyDecode(final VCFHeader header, final boolean lenientDecoding) {
if ( isFullyDecoded() )
return this;
else {
// TODO -- warning this is potentially very expensive as it creates copies over and over
final VariantContextBuilder builder = new VariantContextBuilder(this);
fullyDecodeInfo(builder, header);
fullyDecodeInfo(builder, header, lenientDecoding);
fullyDecodeGenotypes(builder, header);
builder.fullyDecoded(true);
return builder.make();
@ -1358,13 +1358,13 @@ public class VariantContext implements Feature { // to enable tribble integratio
return fullyDecoded;
}
private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header) {
builder.attributes(fullyDecodeAttributes(getAttributes(), header, false));
private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) {
builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding));
}
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
final VCFHeader header,
final boolean allowMissingValuesComparedToHeader) {
final boolean lenientDecoding) {
final Map<String, Object> newAttributes = new HashMap<String, Object>(attributes.size());
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
@ -1377,7 +1377,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
final Object decoded = decodeValue(field, attr.getValue(), format);
if ( decoded != null &&
! allowMissingValuesComparedToHeader
! lenientDecoding
&& format.getCountType() != VCFHeaderLineCount.UNBOUNDED
&& format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements
final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1;

View File

@ -155,7 +155,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
public void add( VariantContext vc ) {
if ( doNotWriteGenotypes )
vc = new VariantContextBuilder(vc).noGenotypes().make();
vc = vc.fullyDecode(header);
vc = vc.fullyDecode(header, false);
super.add(vc); // allow on the fly indexing

View File

@ -569,6 +569,6 @@ class VCFWriter extends IndexingVariantContextWriter {
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have"
+ " complete VCF headers by default. This error can be disabled with the engine argument"
+ " --allowMissingVCFHeaders");
+ " -U LENIENT_VCF_PROCESSING");
}
}

View File

@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
"-o %s --no_cmdline_in_header --allowMissingVCFHeaders", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca"));
"-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("cba514105039f7a56f7ecdd241fbdcca"));
spec.disableShadowBCF();
executeTest("test BeagleOutputToVCF", spec);
}
@ -51,7 +51,7 @@ public class BeagleIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ProduceBeagleInput -R " + hg19Reference + " " +
"--variant:VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " +
"-o %s --allowMissingVCFHeaders", 1, Arrays.asList("f301b089d21da259873f04bdc468835d"));
"-o %s -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("f301b089d21da259873f04bdc468835d"));
spec.disableShadowBCF();
executeTest("test BeagleInput", spec);
}
@ -61,7 +61,7 @@ public class BeagleIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ProduceBeagleInput --variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+
"--validation:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 --allowMissingVCFHeaders -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2,
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -U LENIENT_VCF_PROCESSING -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2,
Arrays.asList("660986891b30cdc937e0f2a3a5743faa","4b6417f892ccfe5c63b8a60cb0ef3740"));
spec.disableShadowBCF();
executeTest("test BeagleInputWithBootstrap",spec);
@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s --no_cmdline_in_header --allowMissingVCFHeaders",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01"));
"-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d95a97068a97c9059811b2574b73ea60"));
spec.disableShadowBCF();
executeTest("testBeagleChangesSitesToRef",spec);
}

View File

@ -66,7 +66,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -L 20:12,000,000-30,000,000" +
" --no_cmdline_in_header" +
" -input " + params.inVCF +
" -o %s" +
" -U LENIENT_VCF_PROCESSING -o %s" +
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
Arrays.asList(params.cutVCFMD5));
@ -113,7 +113,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -T ApplyRecalibration" +
" -L 20:12,000,000-30,000,000" +
" -mode INDEL" +
" --no_cmdline_in_header" +
" -U LENIENT_VCF_PROCESSING --no_cmdline_in_header" +
" -input " + params.inVCF +
" -o %s" +
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +

View File

@ -38,14 +38,14 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
// TODO TODO TODO TODO TODO TODO TODO TODO
// TODO TODO TODO TODO TODO TODO TODO TODO
//
// TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE allowMissingVCFHeaders ARGUMENTS
// TODO WHEN THE HC EMITS VALID VCF HEADERS ENABLE BCF AND REMOVE lenientVCFProcessing ARGUMENTS
//
// TODO TODO TODO TODO TODO TODO TODO TODO
// TODO TODO TODO TODO TODO TODO TODO TODO
// TODO TODO TODO TODO TODO TODO TODO TODO
//
private static String baseTestString(String args) {
return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s --allowMissingVCFHeaders -R " + b36KGReference + args;
return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args;
}
private void cvExecuteTest(final String name, final WalkerTestSpec spec) {

View File

@ -18,7 +18,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders",
+ " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",
1,
Arrays.asList("d88bdae45ae0e74e8d8fd196627e612c")
);
@ -47,7 +47,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
+ " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",
1,
Arrays.asList("54289033d35d32b8ebbb38c51fbb614c")
);
@ -93,7 +93,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc "
+ b37hapmapGenotypes + " --variant " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
+ " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",
1,
Arrays.asList("946e7f2e0ae08dc0e931c1634360fc46")
);
@ -161,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("a0b7f77edc16df0992d2c1363136a17e")
Arrays.asList("ef3c5f75074a5dd2b2cd2715856a2542")
);
executeTest("testNoGTs--" + testFile, spec);
@ -223,7 +223,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
final String testFile = privateTestDir + "missingHeaderLine.vcf";
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp "
+ testFile + " -o %s --no_cmdline_in_header"
+ (expectedException == null ? " -allowMissingVCFHeaders" : "");
+ (expectedException == null ? " -lenientVCFProcessing" : "");
WalkerTestSpec spec =
expectedException != null
? new WalkerTestSpec(cmd, 1, expectedException)

View File

@ -149,7 +149,7 @@ public class VariantContextTestProvider {
logger.warn("Reading records from " + file);
for ( final VariantContext raw : x.getSecond() ) {
if ( raw != null )
fullyDecoded.add(raw.fullyDecode(x.getFirst()));
fullyDecoded.add(raw.fullyDecode(x.getFirst(), false));
}
logger.warn("Done reading " + file);
@ -599,7 +599,7 @@ public class VariantContextTestProvider {
public VariantContext next() {
try {
final VariantContext vc = codec.decode(pbs);
return vc == null ? null : vc.fullyDecode(header);
return vc == null ? null : vc.fullyDecode(header, false);
} catch ( IOException e ) {
throw new RuntimeException(e);
}

View File

@ -121,7 +121,7 @@ public class VariantContextWritersUnitTest extends BaseTest {
final List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(vcsAfterIO.size());
for ( final VariantContext withStrings : vcsAfterIO )
fullyDecoded.add(withStrings.fullyDecode(header));
fullyDecoded.add(withStrings.fullyDecode(header, false));
return fullyDecoded;
}