From aeb34758e6b66c32f6b050b1248289e702767757 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 29 Dec 2009 15:33:53 +0000 Subject: [PATCH] Adding a validation stringency to the VCF writers (which defaults to STRICT). If set to SILENT, it will not throw an exception for (reasonable) off-spec requests but will instead ignore such requests and silently move on. This change allows the pooled calculation model to work correctly with multiple threads. Boys, the Genotyper is now officially parallelized. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2462 348d0f76-0448-11de-a6fe-93d51630548a --- .../io/storage/VCFGenotypeWriterStorage.java | 9 +++- .../gatk/io/stubs/VCFGenotypeWriterStub.java | 8 ++++ .../walkers/genotyper/UnifiedGenotyper.java | 31 +++++++++---- .../utils/genotype/vcf/VCFGenotypeWriter.java | 8 ++++ .../vcf/VCFGenotypeWriterAdapter.java | 22 ++++++--- .../sting/utils/genotype/vcf/VCFRecord.java | 46 +++++++++++++------ .../sting/utils/genotype/vcf/VCFWriter.java | 12 ++++- 7 files changed, 105 insertions(+), 31 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java index 5bc02c6c3..27150e5de 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java @@ -51,11 +51,18 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage public void addRecord(VCFRecord vcfRecord) { outputTracker.getStorage(this).addRecord(vcfRecord); } + + /** + * set the validation stringency + * @param value validation stringency value + */ + public void setValidationStringency(VALIDATION_STRINGENCY value) { + outputTracker.getStorage(this).setValidationStringency(value); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 6362cb03b..2e70fe6da 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -100,7 +100,7 @@ public class UnifiedGenotyper extends LocusWalker Double.MIN_VALUE ) { + if ( UAC.LOD_THRESHOLD != Double.MIN_VALUE ) { StringBuilder sb = new StringBuilder(); sb.append("\n***\tThe --lod_threshold argument is no longer supported; instead, please use --min_confidence_threshold."); sb.append("\n***\tThere is approximately a 10-to-1 mapping from confidence to LOD."); @@ -108,7 +108,7 @@ public class UnifiedGenotyper extends LocusWalker 1 ) { // no ASSUME_SINGLE_SAMPLE because the IO system doesn't know how to get the sample name if ( UAC.ASSUME_SINGLE_SAMPLE != null ) @@ -118,6 +118,13 @@ public class UnifiedGenotyper extends LocusWalker 1 && writer instanceof VCFGenotypeWriter ) + ((VCFGenotypeWriter)writer).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT); } // initialize the verbose writer @@ -175,10 +186,10 @@ public class UnifiedGenotyper extends LocusWalker mSampleNames = new LinkedHashSet(); - /** our log, which we want to capture anything from this class */ + // our log, which we want to capture anything from this class protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); + // validation stringency + private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT; + public VCFGenotypeWriterAdapter(File writeTo) { if (writeTo == null) throw new RuntimeException("VCF output file must not be null"); @@ -41,7 +45,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { * @param sampleNames the sample names * @param headerInfo the optional header fields */ - @Override public void writeHeader(Set sampleNames, Set headerInfo) { mSampleNames.addAll(sampleNames); @@ -96,7 +99,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // get the location and reference if ( genotypes.size() == 0 ) { if ( locusdata == null ) - throw new IllegalArgumentException("Unable to parse out the current location: genotype array must contain at least one entry or have locusdata"); + throw new IllegalArgumentException("Unable to parse out the current location: genotype array must contain at least one entry or have variation data"); params.setLocations(locusdata.getLocation(), locusdata.getReference().charAt(0)); @@ -121,7 +124,7 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { } } - if (genotypeMap.size() > 0) { + if ( validationStringency == VALIDATION_STRINGENCY.STRICT && genotypeMap.size() > 0 ) { for (String name : genotypeMap.keySet()) logger.fatal("Genotype " + name + " was present in the VCFHeader"); throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header"); @@ -151,11 +154,11 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { params.getFormatString(), params.getGenotypesRecords()); - mWriter.addRecord(vcfRecord); + mWriter.addRecord(vcfRecord, validationStringency); } public void addRecord(VCFRecord vcfRecord) { - mWriter.addRecord(vcfRecord); + mWriter.addRecord(vcfRecord, validationStringency); } /** @@ -227,4 +230,11 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { return map; } + /** + * set the validation stringency + * @param value validation stringency value + */ + public void setValidationStringency(VALIDATION_STRINGENCY value) { + validationStringency = value; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index b28a59dfe..fc9752629 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -504,6 +504,17 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @return a string */ public String toStringEncoding(VCFHeader header) { + return toStringEncoding(header, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT); + } + + /** + * the generation of a string representation, which is used by the VCF writer + * + * @param header the VCF header for this VCF Record + * @param validationStringency the validation stringency + * @return a string + */ + public String toStringEncoding(VCFHeader header, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) { StringBuilder builder = new StringBuilder(); // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO @@ -524,9 +535,15 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { builder.append(FIELD_SEPERATOR); builder.append(createInfoString()); - if (this.hasGenotypeData()) { - addGenotypeData(builder, header); + if ( this.hasGenotypeData() ) { + try { + addGenotypeData(builder, header); + } catch (Exception e) { + if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT ) + throw new RuntimeException(e.getMessage()); + } } + return builder.toString(); } @@ -553,28 +570,31 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { * @param header the header object */ private void addGenotypeData(StringBuilder builder, VCFHeader header) { - builder.append(FIELD_SEPERATOR + mGenotypeFormatString); - if (header.getGenotypeSamples().size() < getGenotypes().size()) - throw new RuntimeException("We have more genotype samples than the header specified"); + StringBuffer tempStr = new StringBuffer(); + if ( header.getGenotypeSamples().size() < getGenotypes().size() ) + throw new IllegalStateException("We have more genotype samples than the header specified"); + tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString); Map gMap = genotypeListToMap(getGenotypes()); String[] genotypeFormatStrings = mGenotypeFormatString.split(":"); - for (String genotype : header.getGenotypeSamples()) { - builder.append(FIELD_SEPERATOR); - if (gMap.containsKey(genotype)) { + for ( String genotype : header.getGenotypeSamples() ) { + tempStr.append(FIELD_SEPERATOR); + if ( gMap.containsKey(genotype) ) { VCFGenotypeRecord rec = gMap.get(genotype); - builder.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings)); + tempStr.append(rec.toStringEncoding(this.mAlts, genotypeFormatStrings)); gMap.remove(genotype); } else { - builder.append(VCFGenotypeRecord.EMPTY_GENOTYPE); + tempStr.append(VCFGenotypeRecord.EMPTY_GENOTYPE); } } - if (gMap.size() != 0) { - for (String sample : gMap.keySet()) + if ( gMap.size() != 0 ) { + for ( String sample : gMap.keySet() ) System.err.println("Sample " + sample + " is being genotyped but isn't in the header."); - throw new RuntimeException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); + throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); } + + builder.append(tempStr); } /** diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index a0187f567..b6c2e56da 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -84,10 +84,20 @@ public class VCFWriter { * @param record the record to output */ public void addRecord(VCFRecord record) { + addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT); + } + + /** + * output a record to the VCF file + * + * @param record the record to output + * @param validationStringency the validation stringency + */ + public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added"); - String vcfString = record.toStringEncoding(mHeader); + String vcfString = record.toStringEncoding(mHeader, validationStringency); try { mWriter.write(vcfString + "\n"); mWriter.flush(); // necessary so that writing to an output stream will work