Adding a validation stringency setting to the VCF writers (it defaults to STRICT). When set to SILENT, a writer does not throw an exception for (reasonable) off-spec requests; instead it ignores such requests and silently moves on.

This change allows the pooled calculation model to work correctly with multiple threads.  Boys, the Genotyper is now officially parallelized.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2462 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-29 15:33:53 +00:00
parent 29a3d9b47a
commit aeb34758e6
7 changed files with 105 additions and 31 deletions

View File

@ -51,11 +51,18 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage<VCFGenotypeW
((VCFGenotypeWriter)writer).addRecord(vcfRecord);
}
/**
 * Forwards the requested validation stringency to the VCF writer held by this storage.
 * @param value validation stringency value
 */
public void setValidationStringency(VALIDATION_STRINGENCY value) {
    // address the held writer through the VCF-specific interface before delegating
    final VCFGenotypeWriter vcfWriter = (VCFGenotypeWriter) writer;
    vcfWriter.setValidationStringency(value);
}
/**
* Merges the stream backing up this temporary storage into the target.
* @param target Target stream for the temporary storage. May not be null.
*/
@Override
public void mergeInto(VCFGenotypeWriter target) {
VCFReader reader = new VCFReader(file);
while ( reader.hasNext() )

View File

@ -50,4 +50,12 @@ public class VCFGenotypeWriterStub extends GenotypeWriterStub<VCFGenotypeWriter>
public void addRecord(VCFRecord vcfRecord) {
outputTracker.getStorage(this).addRecord(vcfRecord);
}
/**
* Sets the validation stringency by forwarding the value to the storage
* that the output tracker returns for this stub.
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value) {
outputTracker.getStorage(this).setValidationStringency(value);
}
}

View File

@ -100,7 +100,7 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
if ( UAC.POOLSIZE < 1 && UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ) {
throw new IllegalArgumentException("Attempting to use the POOLED model with a pool size less than 1. Please set the pool size to an appropriate value.");
}
if ( UAC.LOD_THRESHOLD > Double.MIN_VALUE ) {
if ( UAC.LOD_THRESHOLD != Double.MIN_VALUE ) {
StringBuilder sb = new StringBuilder();
sb.append("\n***\tThe --lod_threshold argument is no longer supported; instead, please use --min_confidence_threshold.");
sb.append("\n***\tThere is approximately a 10-to-1 mapping from confidence to LOD.");
@ -108,7 +108,7 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
throw new IllegalArgumentException(sb.toString());
}
// some arguments can't be handled (for now) while we are multi-threaded
// some arguments can't be handled (at least for now) while we are multi-threaded
if ( getToolkit().getArguments().numberOfThreads > 1 ) {
// no ASSUME_SINGLE_SAMPLE because the IO system doesn't know how to get the sample name
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
@ -118,6 +118,13 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
throw new IllegalArgumentException("For technical reasons, the VERBOSE argument cannot be used with multiple threads");
}
// set up the writer manually if it needs to use the output stream
if ( writer == null && out != null ) {
logger.warn("For technical reasons, VCF format must be used when writing to standard out.");
logger.warn("Specify an output file if you would like to use a different output format.");
writer = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.VCF, out);
}
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
// if we're supposed to assume a single sample, do so
@ -130,11 +137,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// logger.debug("SAMPLE: " + sample);
}
// set up the writer manually if it needs to use the output stream
if ( writer == null && out != null ) {
logger.warn("For technical reasons, VCF format must be used when writing to standard out.");
logger.warn("Specify an output file if you would like to use a different output format.");
writer = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.VCF, out);
// in pooled mode we need to check that the format is acceptable
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED && writer != null ) {
// only multi-sample calls use Variations
if ( !writer.supportsMultiSample() )
throw new IllegalArgumentException("The POOLED model is not compatible with the specified format; try using VCF instead");
// when using VCF with multiple threads, we need to turn down the validation stringency so that writing temporary files will work
if ( getToolkit().getArguments().numberOfThreads > 1 && writer instanceof VCFGenotypeWriter )
((VCFGenotypeWriter)writer).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT);
}
// initialize the verbose writer
@ -175,10 +186,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// annotation (INFO) fields from UnifiedGenotyper
headerInfo.add(new VCFHeaderLine("INFO_NOTE", "\"All annotations in the INFO field are generated only from the FILTERED context used for calling variants\""));
headerInfo.add(new VCFInfoHeaderLine("AF", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Allele Frequency"));
headerInfo.add(new VCFInfoHeaderLine("NS", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Number of Samples With Data"));
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_FREQUENCY_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Allele Frequency"));
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.SAMPLE_NUMBER_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Number of Samples With Data"));
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine("SB", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.STRAND_BIAS_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
// FORMAT fields if not in POOLED mode
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED )

View File

@ -25,4 +25,12 @@ public interface VCFGenotypeWriter extends GenotypeWriter {
* @param vcfRecord Record to add.
*/
public void addRecord(VCFRecord vcfRecord);
/**
* set the validation stringency
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value);
/**
* Validation stringency levels for VCF writers.
* STRICT: off-spec requests cause an exception to be thrown.
* SILENT: (reasonable) off-spec requests are ignored without throwing.
*/
public enum VALIDATION_STRINGENCY { STRICT, SILENT };
}

View File

@ -16,14 +16,18 @@ import java.util.*;
* Adapt the VCF writer to the genotype output system
*/
public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
// our VCF objects
private VCFWriter mWriter = null;
private VCFHeader mHeader = null;
private final Set<String> mSampleNames = new LinkedHashSet<String>();
/** our log, which we want to capture anything from this class */
// our log, which we want to capture anything from this class
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
// validation stringency
private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT;
public VCFGenotypeWriterAdapter(File writeTo) {
if (writeTo == null) throw new RuntimeException("VCF output file must not be null");
@ -41,7 +45,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
* @param sampleNames the sample names
* @param headerInfo the optional header fields
*/
@Override
public void writeHeader(Set<String> sampleNames, Set<VCFHeaderLine> headerInfo) {
mSampleNames.addAll(sampleNames);
@ -96,7 +99,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
// get the location and reference
if ( genotypes.size() == 0 ) {
if ( locusdata == null )
throw new IllegalArgumentException("Unable to parse out the current location: genotype array must contain at least one entry or have locusdata");
throw new IllegalArgumentException("Unable to parse out the current location: genotype array must contain at least one entry or have variation data");
params.setLocations(locusdata.getLocation(), locusdata.getReference().charAt(0));
@ -121,7 +124,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
}
}
if (genotypeMap.size() > 0) {
if ( validationStringency == VALIDATION_STRINGENCY.STRICT && genotypeMap.size() > 0 ) {
for (String name : genotypeMap.keySet())
logger.fatal("Genotype " + name + " was present in the VCFHeader");
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
@ -151,11 +154,11 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
params.getFormatString(),
params.getGenotypesRecords());
mWriter.addRecord(vcfRecord);
mWriter.addRecord(vcfRecord, validationStringency);
}
public void addRecord(VCFRecord vcfRecord) {
mWriter.addRecord(vcfRecord);
mWriter.addRecord(vcfRecord, validationStringency);
}
/**
@ -227,4 +230,11 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
return map;
}
/**
 * Updates the validation stringency that this adapter passes along when writing records.
 * @param value validation stringency value
 */
public void setValidationStringency(VALIDATION_STRINGENCY value) {
    this.validationStringency = value;
}
}

View File

@ -504,6 +504,17 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @return a string
*/
public String toStringEncoding(VCFHeader header) {
    // callers that do not specify a stringency get STRICT validation
    final VCFGenotypeWriter.VALIDATION_STRINGENCY defaultStringency = VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT;
    return toStringEncoding(header, defaultStringency);
}
/**
* the generation of a string representation, which is used by the VCF writer
*
* @param header the VCF header for this VCF Record
* @param validationStringency the validation stringency
* @return a string
*/
public String toStringEncoding(VCFHeader header, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) {
StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
@ -524,9 +535,15 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(FIELD_SEPERATOR);
builder.append(createInfoString());
if (this.hasGenotypeData()) {
addGenotypeData(builder, header);
if ( this.hasGenotypeData() ) {
try {
addGenotypeData(builder, header);
} catch (Exception e) {
if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
throw new RuntimeException(e.getMessage());
}
}
return builder.toString();
}
@ -553,28 +570,31 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @param header the header object
*/
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
builder.append(FIELD_SEPERATOR + mGenotypeFormatString);
if (header.getGenotypeSamples().size() < getGenotypes().size())
throw new RuntimeException("We have more genotype samples than the header specified");
StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < getGenotypes().size() )
throw new IllegalStateException("We have more genotype samples than the header specified");
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
for (String genotype : header.getGenotypeSamples()) {
builder.append(FIELD_SEPERATOR);
if (gMap.containsKey(genotype)) {
for ( String genotype : header.getGenotypeSamples() ) {
tempStr.append(FIELD_SEPERATOR);
if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype);
builder.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings));
tempStr.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings));
gMap.remove(genotype);
} else {
builder.append(VCFGenotypeRecord.EMPTY_GENOTYPE);
tempStr.append(VCFGenotypeRecord.EMPTY_GENOTYPE);
}
}
if (gMap.size() != 0) {
for (String sample : gMap.keySet())
if ( gMap.size() != 0 ) {
for ( String sample : gMap.keySet() )
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
throw new RuntimeException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
}
builder.append(tempStr);
}
/**

View File

@ -84,10 +84,20 @@ public class VCFWriter {
* @param record the record to output
*/
public void addRecord(VCFRecord record) {
    // callers that do not specify a stringency get STRICT validation
    final VCFGenotypeWriter.VALIDATION_STRINGENCY defaultStringency = VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT;
    addRecord(record, defaultStringency);
}
/**
* output a record to the VCF file
*
* @param record the record to output
* @param validationStringency the validation stringency
*/
public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added");
String vcfString = record.toStringEncoding(mHeader);
String vcfString = record.toStringEncoding(mHeader, validationStringency);
try {
mWriter.write(vcfString + "\n");
mWriter.flush(); // necessary so that writing to an output stream will work