Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-08-16 13:05:36 -04:00
commit 611d9b61e2
14 changed files with 93 additions and 121 deletions

View File

@ -849,20 +849,9 @@ public class GenomeAnalysisEngine {
SAMSequenceDictionary sequenceDictionary,
GenomeLocParser genomeLocParser,
ValidationExclusion.TYPE validationExclusionType) {
VCFHeader header = null;
if ( getArguments().repairVCFHeader != null ) {
try {
final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(getArguments().repairVCFHeader));
header = (VCFHeader)new VCFCodec().readHeader(pbs).getHeaderValue();
pbs.close();
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e);
}
}
final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType);
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType);
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
final List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
dataSources.add(new ReferenceOrderedDataSource(fileDescriptor,
builder,

View File

@ -384,14 +384,5 @@ public class GATKArgumentCollection {
@Hidden
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other
* VCF file that GATK reads in. This allows us to have in effect a master set of header records and use these
* to fill in any missing ones in input VCF files.
*/
@Argument(fullName="repairVCFHeader", shortName = "repairVCFHeader", doc="If provided, whenever we read a VCF file we will use the header in this file to repair the header of the input VCF files", required=false)
public File repairVCFHeader = null;
}

View File

@ -119,7 +119,7 @@ public class ThreadLocalOutputTracker extends OutputTracker {
try {
tempFile = File.createTempFile( stub.getClass().getName(), null );
tempFile.deleteOnExit();
//tempFile.deleteOnExit();
}
catch( IOException ex ) {
throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() );

View File

@ -61,6 +61,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
protected final File file;
protected OutputStream stream;
protected final VariantContextWriter writer;
boolean closed = false;
/**
* Constructs an object which will write directly into the output file provided by the stub.
@ -172,10 +173,13 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if(file != null)
logger.debug("Closing temporary file " + file.getAbsolutePath());
writer.close();
closed = true;
}
public void mergeInto(VariantContextWriterStorage target) {
try {
if ( ! closed )
throw new ReviewedStingException("Writer not closed, but we are merging into the file!");
final String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin";
logger.debug(String.format("Merging %s into %s",file.getAbsolutePath(),targetFilePath));
@ -194,6 +198,9 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
}
source.close();
file.delete(); // this should be last to aid in debugging when the process fails
} catch (UserException e) {
throw new ReviewedStingException("BUG: intermediate file " + file + " is malformed, got error while reading", e);
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
}

View File

@ -47,6 +47,7 @@ import java.util.List;
public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public static final String NO_HEADER_ARG_NAME = "no_cmdline_in_header";
public static final String SITES_ONLY_ARG_NAME = "sites_only";
public static final String FORCE_BCF = "bcf";
public static final HashSet<String> SUPPORTED_ZIPPED_SUFFIXES = new HashSet<String>();
//
@ -96,7 +97,11 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public List<ArgumentDefinition> createArgumentDefinitions( ArgumentSource source ) {
return Arrays.asList( createDefaultArgumentDefinition(source), createNoCommandLineHeaderArgumentDefinition(),createSitesOnlyArgumentDefinition());
return Arrays.asList(
createDefaultArgumentDefinition(source),
createNoCommandLineHeaderArgumentDefinition(),
createSitesOnlyArgumentDefinition(),
createBCFArgumentDefinition() );
}
/**
@ -117,7 +122,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
if(!source.isRequired())
throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");
VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, false, argumentSources, false, false);
VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
engine.addOutput(stub);
return stub;
}
@ -141,15 +146,15 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
if(writerFile == null && !source.isRequired())
throw new MissingArgumentValueException(defaultArgumentDefinition);
// Should we compress the output stream?
boolean compress = isCompressed(writerFileName);
boolean skipWritingCmdLineHeader = argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches);
boolean doNotWriteGenotypes = argumentIsPresent(createSitesOnlyArgumentDefinition(),matches);
// Create a stub for the given object.
VariantContextWriterStub stub = (writerFile != null) ? new VariantContextWriterStub(engine, writerFile, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes)
: new VariantContextWriterStub(engine, defaultOutputStream, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes);
final VariantContextWriterStub stub = (writerFile != null)
? new VariantContextWriterStub(engine, writerFile, argumentSources)
: new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
stub.setCompressed(isCompressed(writerFileName));
stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches));
stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches));
stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches));
// WARNING: Side effects required by engine!
parsingEngine.addTags(stub,getArgumentTags(matches));
@ -159,8 +164,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
/**
* Creates the optional compression level argument for the BAM file.
* @return Argument definition for the BAM file itself. Will not be null.
* Creates the optional no_header argument for the VCF file.
* @return Argument definition for the VCF file itself. Will not be null.
*/
private ArgumentDefinition createNoCommandLineHeaderArgumentDefinition() {
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
@ -179,8 +184,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
/**
* Creates the optional compression level argument for the BAM file.
* @return Argument definition for the BAM file itself. Will not be null.
* Creates the optional sites_only argument definition
* @return Argument definition for the VCF file itself. Will not be null.
*/
private ArgumentDefinition createSitesOnlyArgumentDefinition() {
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
@ -198,6 +203,26 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
null );
}
/**
 * Creates the optional bcf argument definition, used to force BCF output
 * regardless of the output file's extension.
 * @return Argument definition for the bcf flag. Will not be null.
 */
private ArgumentDefinition createBCFArgumentDefinition() {
// NOTE(review): the positional boolean/null arguments below mirror the sibling
// create*ArgumentDefinition helpers; confirm their order against the
// ArgumentDefinition constructor before changing any of them.
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
boolean.class,
FORCE_BCF,
FORCE_BCF,
"force BCF output, regardless of the file's extension",
false,
true,
false,
true,
null,
null,
null,
null );
}
/**
* Returns true if the file will be compressed.
* @param writerFileName Name of the file

View File

@ -79,7 +79,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
/**
* Should we emit a compressed output stream?
*/
private final boolean isCompressed;
private boolean isCompressed = false;
/**
* A hack: push the argument sources into the VCF header so that the VCF header
@ -90,12 +90,17 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
/**
* Should the header be written out? A hidden argument.
*/
private final boolean skipWritingCommandLineHeader;
private boolean skipWritingCommandLineHeader = false;
/**
* Should we not write genotypes even when provided?
*/
private final boolean doNotWriteGenotypes;
private boolean doNotWriteGenotypes = false;
/**
* Should we force BCF writing regardless of the file extension?
*/
private boolean forceBCF = false;
/**
* Connects this stub with an external stream capable of serving the
@ -108,19 +113,13 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
*
* @param engine engine.
* @param genotypeFile file to (ultimately) create.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingCommandLineHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingCommandLineHeader, boolean doNotWriteGenotypes) {
public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, Collection<Object> argumentSources) {
this.engine = engine;
this.genotypeFile = genotypeFile;
this.genotypeStream = null;
this.isCompressed = isCompressed;
this.argumentSources = argumentSources;
this.skipWritingCommandLineHeader = skipWritingCommandLineHeader;
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
/**
@ -128,19 +127,13 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
*
* @param engine engine.
* @param genotypeStream stream to (ultimately) write.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingCommandLineHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingCommandLineHeader, boolean doNotWriteGenotypes) {
public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, Collection<Object> argumentSources) {
this.engine = engine;
this.genotypeFile = null;
this.genotypeStream = new PrintStream(genotypeStream);
this.isCompressed = isCompressed;
this.argumentSources = argumentSources;
this.skipWritingCommandLineHeader = skipWritingCommandLineHeader;
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
/**
@ -167,6 +160,22 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
return isCompressed;
}
/** Sets whether we should emit a compressed output stream. */
public void setCompressed(boolean compressed) {
isCompressed = compressed;
}
/** Sets whether writing of the command-line header should be skipped (controlled by a hidden argument). */
public void setSkipWritingCommandLineHeader(boolean skipWritingCommandLineHeader) {
this.skipWritingCommandLineHeader = skipWritingCommandLineHeader;
}
/** Sets whether genotypes should be omitted from the output even when provided. */
public void setDoNotWriteGenotypes(boolean doNotWriteGenotypes) {
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
/** Sets whether BCF writing should be forced regardless of the file extension. */
public void setForceBCF(boolean forceBCF) {
this.forceBCF = forceBCF;
}
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
@ -187,7 +196,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
if ( getFile() != null && VariantContextWriterFactory.isBCFOutput(getFile()) )
if ( forceBCF || (getFile() != null && VariantContextWriterFactory.isBCFOutput(getFile())) )
options.add(Options.FORCE_BCF);
return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);

View File

@ -85,18 +85,16 @@ public class FeatureManager {
private final PluginManager<FeatureCodec> pluginManager;
private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
private final VCFHeader headerForRepairs;
private final boolean lenientVCFProcessing;
/**
* Construct a FeatureManager without a master VCF header
*/
public FeatureManager() {
this(null, false);
this(false);
}
public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) {
this.headerForRepairs = headerForRepairs;
public FeatureManager(final boolean lenientVCFProcessing) {
this.lenientVCFProcessing = lenientVCFProcessing;
pluginManager = new PluginManager<FeatureCodec>(FeatureCodec.class, "Codecs", "Codec");
@ -255,8 +253,6 @@ public class FeatureManager {
((NameAwareCodec)codex).setName(name);
if ( codex instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser);
if ( codex instanceof VCFCodec )
((VCFCodec)codex).setHeaderForRepairs(headerForRepairs);
if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing )
((AbstractVCFCodec)codex).disableOnTheFlyModifications();

View File

@ -89,17 +89,15 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
* please talk through your approach with the SE team.
* @param dict Sequence dictionary to use.
* @param genomeLocParser Location parser to use.
* @param headerForRepairs a VCF header that should be used to repair VCF headers. Can be null
* @param validationExclusionType Types of validations to exclude, for sequence dictionary verification.
*/
public RMDTrackBuilder(final SAMSequenceDictionary dict,
final GenomeLocParser genomeLocParser,
final VCFHeader headerForRepairs,
ValidationExclusion.TYPE validationExclusionType) {
this.dict = dict;
this.validationExclusionType = validationExclusionType;
this.genomeLocParser = genomeLocParser;
this.featureManager = new FeatureManager(headerForRepairs, GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType));
}
/**
@ -111,18 +109,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
return featureManager;
}
/**
* Same as full constructor but makes one without a header for repairs
* @param dict
* @param genomeLocParser
* @param validationExclusionType
*/
public RMDTrackBuilder(final SAMSequenceDictionary dict,
final GenomeLocParser genomeLocParser,
ValidationExclusion.TYPE validationExclusionType) {
this(dict, genomeLocParser, null, validationExclusionType);
}
/**
* create a RMDTrack of the specified type
*

View File

@ -19,6 +19,8 @@ import java.util.*;
* it computes the AC from the genotypes themselves. If no AC can be computed, 0 is used.
*/
public class AlleleCount extends VariantStratifier {
int nchrom;
@Override
public void initialize() {
// we can only work with a single eval VCF, and it must have genotypes
@ -26,7 +28,8 @@ public class AlleleCount extends VariantStratifier {
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf");
// There are 2 x n sample chromosomes for diploids
int nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * 2;
// TODO -- generalize to handle multiple ploidy
nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * 2;
if ( nchrom < 2 )
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample");
@ -52,8 +55,10 @@ public class AlleleCount extends VariantStratifier {
}
// make sure that the AC isn't invalid
if ( AC > eval.getCalledChrCount() )
throw new UserException.MalformedVCF(String.format("The AC or MLEAC value (%d) at position %s:%d is larger than the possible called chromosome count (%d)", AC, eval.getChr(), eval.getStart(), eval.getCalledChrCount()));
if ( AC > nchrom )
throw new UserException.MalformedVCF(String.format("The AC or MLEAC value (%d) at position %s:%d " +
"is larger than the number of chromosomes over all samples (%d)", AC,
eval.getChr(), eval.getStart(), nchrom));
return Collections.singletonList((Object) AC);
} else {

View File

@ -51,7 +51,6 @@ import java.util.Map;
*/
public final class BCF2Codec implements FeatureCodec<VariantContext> {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private final static boolean FORBID_SYMBOLICS = false;
private final static int ALLOWED_MAJOR_VERSION = 2;
private final static int MIN_MINOR_VERSION = 1;
@ -178,7 +177,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
contigNames.add(contig.getID());
}
} else {
throw new UserException.MalformedBCF2("Didn't find any contig lines in BCF2 file header");
error("Didn't find any contig lines in BCF2 file header");
}
// create the string dictionary
@ -271,7 +270,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
final int nSamples = nFormatSamples & 0x00FFFFF;
if ( header.getNGenotypeSamples() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
error("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + header.getNGenotypeSamples() +
" samples in header but have a record with " + nSamples + " samples");
@ -343,9 +342,6 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
if ( isRef ) ref = alleleBases;
alleles.add(allele);
if ( FORBID_SYMBOLICS && allele.isSymbolic() )
throw new ReviewedStingException("LIMITATION: GATK BCF2 codec does not yet support symbolic alleles");
}
assert ref != null;
@ -496,7 +492,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
return gtFieldDecoders.getDecoder(field);
}
private final void error(final String message) throws RuntimeException {
/**
 * Throws a MalformedBCF2 exception whose message is decorated with the current
 * record number and position being decoded. Never returns normally.
 * @param message description of the malformation encountered
 */
private void error(final String message) throws RuntimeException {
throw new UserException.MalformedBCF2(String.format("%s, at record %d with position %d:", message, recordNo, pos));
}
}

View File

@ -49,13 +49,6 @@ public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
/**
* A VCF header the contains master info/filter/format records that we use to 'fill in'
* any missing records from our input VCF header. This allows us to repair headers on
* the fly
*/
private VCFHeader headerForRepairs = null;
/**
* @param reader the line reader to take header lines from
* @return the number of header lines
@ -88,8 +81,6 @@ public class VCFCodec extends AbstractVCFCodec {
}
headerStrings.add(line);
super.parseHeaderFromLines(headerStrings, version);
if ( headerForRepairs != null )
this.header = repairHeader(this.header, headerForRepairs);
return this.header;
}
else {
@ -103,24 +94,6 @@ public class VCFCodec extends AbstractVCFCodec {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
}
private final VCFHeader repairHeader(final VCFHeader readHeader, final VCFHeader masterHeader) {
final Set<VCFHeaderLine> lines = VCFUtils.smartMergeHeaders(Arrays.asList(readHeader, masterHeader), log);
return new VCFHeader(lines, readHeader.getGenotypeSamples());
}
/**
* Tells this VCFCodec to repair the incoming header files with the information in masterHeader
*
* @param headerForRepairs
*/
public void setHeaderForRepairs(final VCFHeader headerForRepairs) {
if ( headerForRepairs != null )
log.info("Using master VCF header to repair missing files from incoming VCFs");
this.headerForRepairs = headerForRepairs;
}
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
*

View File

@ -563,6 +563,6 @@ class VCFWriter extends IndexingVariantContextWriter {
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have"
+ " complete VCF headers by default. This error can be disabled with the engine argument"
+ " -U LENIENT_VCF_PROCESSING or repair the VCF file header using repairVCFHeader");
+ " -U LENIENT_VCF_PROCESSING");
}
}

View File

@ -76,7 +76,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches
"1cdf8c9ee77d91d1ba7f002573108bad", // recal file
"74c10fc15f9739a938b7138909fbde04", // recal file
"62fda105e14b619a1c263855cf56af1d"); // cut VCF
@DataProvider(name = "VRBCFTest")

View File

@ -92,7 +92,7 @@ public class VCFIntegrationTest extends WalkerTest {
//
//
// Tests to ensure that -U LENIENT_VCF_PROCESS and header repairs are working
// Tests to ensure that -U LENIENT_VCF_PROCESSING works
//
//
@ -106,11 +106,6 @@ public class VCFIntegrationTest extends WalkerTest {
runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true);
}
@Test
public void testPassingOnVCFWithoutHeadersRepairingHeaders() {
runVCFWithoutHeaders("-repairVCFHeader " + privateTestDir + "vcfexample2.justHeader.vcf", "ff61e9cad6653c7f93d82d391f7ecdcb", null, false);
}
private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) {
final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf";
final String baseCommand = "-R " + b37KGReference