From 693672a46132235590ccf482a5c418b03dc7ba21 Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 18 Jul 2010 13:19:56 +0000 Subject: [PATCH] Refactoring the VCF writer code; now no longer uses VCFRecord or any of its related classes, instead writing directly to the writer. Integration tests pass, but some are actually broken and will be fixed this week. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3822 348d0f76-0448-11de-a6fe-93d51630548a --- .../org/broad/tribble/vcf/VCFConstants.java | 3 + java/src/org/broad/tribble/vcf/VCFHeader.java | 33 +- .../io/storage/VCFGenotypeWriterStorage.java | 8 - .../gatk/io/stubs/VCFGenotypeWriterStub.java | 8 - .../sequenom/SequenomValidationConverter.java | 6 +- .../utils/genotype/vcf/VCFGenotypeWriter.java | 8 - .../vcf/VCFGenotypeWriterAdapter.java | 19 +- .../sting/utils/genotype/vcf/VCFWriter.java | 604 +++++++----------- ...nomValidationConverterIntegrationTest.java | 2 +- 9 files changed, 278 insertions(+), 413 deletions(-) diff --git a/java/src/org/broad/tribble/vcf/VCFConstants.java b/java/src/org/broad/tribble/vcf/VCFConstants.java index e36de3e31..5c8206e3d 100755 --- a/java/src/org/broad/tribble/vcf/VCFConstants.java +++ b/java/src/org/broad/tribble/vcf/VCFConstants.java @@ -58,6 +58,9 @@ public final class VCFConstants { public static final String FIELD_SEPARATOR = "\t"; public static final String FILTER_CODE_SEPARATOR = ";"; public static final String INFO_FIELD_SEPARATOR = ";"; + public static final String UNPHASED = "/"; + public static final String PHASED = "|"; + public static final String PHASED_SWITCH_PROB_v3 = "\\"; // missing/default values public static final String UNFILTERED = "."; diff --git a/java/src/org/broad/tribble/vcf/VCFHeader.java b/java/src/org/broad/tribble/vcf/VCFHeader.java index fc084062b..02544d94d 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeader.java +++ b/java/src/org/broad/tribble/vcf/VCFHeader.java @@ -20,6 +20,8 @@ public class VCFHeader { // the associated meta data private final Set mMetaData; + private final Map mInfoMetaData = new HashMap(); + private final Map mFormatMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -41,6 +43,7 @@ public class VCFHeader { public VCFHeader(Set metaData) { mMetaData = new TreeSet(metaData); loadVCFVersion(); + loadMetaDataMaps(); } /** @@ -57,6 +60,7 @@ public class VCFHeader { } if (genotypeSampleNames.size() > 0) hasGenotypingData = true; loadVCFVersion(); + loadMetaDataMaps(); } /** @@ -74,6 +78,18 @@ public class VCFHeader { } + /** + * load the format/info meta data maps (these are used for quick lookup by key name) + */ + private void loadMetaDataMaps() { + for ( VCFHeaderLine line : mMetaData ) { + if ( line instanceof VCFInfoHeaderLine ) + mInfoMetaData.put(line.getKey(), (VCFInfoHeaderLine)line); + else if ( line instanceof VCFFormatHeaderLine ) + mFormatMetaData.put(line.getKey(), (VCFFormatHeaderLine)line); + } + } + /** * get the header fields in order they're presented in the input file (which is now required to be * the order presented in the spec). @@ -117,11 +133,26 @@ public class VCFHeader { return hasGenotypingData; } - /** @return the column count, */ + /** @return the column count */ public int getColumnCount() { return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); } + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(String key) { + return mInfoMetaData.get(key); + } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(String key) { + return mFormatMetaData.get(key); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java index 873e51b1c..cd0f6cbef 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java @@ -51,14 +51,6 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage public void addRecord(VCFRecord vcfRecord) { outputTracker.getStorage(this).addRecord(vcfRecord); } - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value) { - outputTracker.getStorage(this).setValidationStringency(value); - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java index 3153b7480..0c7043a51 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java @@ -150,8 +150,10 @@ public class SequenomValidationConverter extends RodWalker 0 ) { + System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords)); + hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords))); + } } VCFHeader header = new VCFHeader(hInfo, sampleNames); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java index 9cd3b875e..63abb54fd 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java @@ -27,12 +27,4 @@ public interface VCFGenotypeWriter extends GenotypeWriter { * @param vcfRecord Record to add. */ public void addRecord(VCFRecord vcfRecord); - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value); - - public enum VALIDATION_STRINGENCY { STRICT, SILENT }; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 6aba9dad8..c32f6c0f3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -27,9 +27,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // our log, which we want to capture anything from this class protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); - // validation stringency - private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT; - // allowed genotype format strings private Set allowedGenotypeFormatStrings = null; @@ -43,6 +40,10 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { mWriter = new VCFWriter(writeTo); } + public void addRecord(VCFRecord vcfRecord) { + mWriter.addRecord(vcfRecord); + } + /** * initialize this VCF header * @@ -92,16 +93,4 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); mWriter.add(vc, new byte[]{(byte)refAllele.charAt(0)}); } - - public void addRecord(VCFRecord vcfRecord) { - mWriter.addRecord(vcfRecord, validationStringency); - } - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value) { - validationStringency = value; - } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 51cfb044f..e4920f915 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.CalledGenotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,7 +20,6 @@ import java.util.*; */ public class VCFWriter { - // the VCF header we're storing private VCFHeader mHeader = null; @@ -29,15 +29,6 @@ public class VCFWriter { // were filters applied? private boolean filtersWereAppliedToContext = false; - // our genotype sample fields - private static final List mGenotypeRecords = new ArrayList(); - - // Properties only used when using VCF4.0 encoding - Map typeUsedForFormatString = new HashMap(); - Map typeUsedForInfoFields = new HashMap(); - Map numberUsedForInfoFields = new HashMap(); - Map numberUsedForFormatFields = new HashMap(); - /** * create a VCF writer, given a file to write to * @@ -72,43 +63,36 @@ public class VCFWriter { mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n"); for ( VCFHeaderLine line : header.getMetaData() ) { - if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) - continue; + if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) || + line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) || + line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) + continue; - // Record, if line corresponds to a FORMAT field, which type will be used for writing value - if (line.getClass() == VCFFormatHeaderLine.class) { - VCFFormatHeaderLine a = (VCFFormatHeaderLine)line; - String key = a.getName(); - typeUsedForFormatString.put(key,a.getType()); - int num = a.getCount(); - numberUsedForFormatFields.put(key,num); - } else if (line.getClass() == VCFInfoHeaderLine.class) { - VCFInfoHeaderLine a = (VCFInfoHeaderLine)line; - String key = a.getName(); - typeUsedForInfoFields.put(key,a.getType()); - int num = a.getCount(); - numberUsedForInfoFields.put(key, num); - } else if (line.getClass() == VCFFilterHeaderLine.class) { - filtersWereAppliedToContext = true; - } + // are the records filtered (so we know what to put in the FILTER column of passing records) ? + if ( line instanceof VCFFilterHeaderLine ) + filtersWereAppliedToContext = true; - mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n"); + mWriter.write(VCFHeader.METADATA_INDICATOR); + mWriter.write(line.toString()); + mWriter.write("\n"); } // write out the column line - StringBuilder b = new StringBuilder(); - b.append(VCFHeader.HEADER_INDICATOR); - for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) - b.append(field + VCFConstants.FIELD_SEPARATOR); - - if (header.hasGenotypingData()) { - b.append("FORMAT" + VCFConstants.FIELD_SEPARATOR); - for (String field : header.getGenotypeSamples()) - b.append(field + VCFConstants.FIELD_SEPARATOR); + mWriter.write(VCFHeader.HEADER_INDICATOR); + for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { + mWriter.write(field.toString()); + mWriter.write(VCFConstants.FIELD_SEPARATOR); } - mWriter.write(b.toString() + "\n"); + + if ( header.hasGenotypingData() ) { + mWriter.write("FORMAT"); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + for ( String sample : header.getGenotypeSamples() ) { + mWriter.write(sample); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + } } + + mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { @@ -116,39 +100,13 @@ public class VCFWriter { } } - /** - * output a record to the VCF file - * - * @param record the record to output - */ - public void add(VCFRecord record) { - addRecord(record); - } - - public void addRecord(VCFRecord record) { - addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT); - } - - public void add(VariantContext vc, byte[] refBases) { - if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added"); - - String vcfString = toStringEncoding(vc, mHeader, refBases); - try { - mWriter.write(vcfString + "\n"); - mWriter.flush(); // necessary so that writing to an output stream will work - } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to a file"); - } - - } /** * output a record to the VCF file * * @param record the record to output - * @param validationStringency the validation stringency */ - public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) { + @Deprecated + public void addRecord(VCFRecord record) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added"); @@ -174,104 +132,188 @@ public class VCFWriter { } } - private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) { - StringBuilder builder = new StringBuilder(); + public void add(VariantContext vc, byte[] refBases) { + if ( mHeader == null ) + throw new IllegalStateException("The VCF Header must be written before records can be added"); + if ( refBases == null || refBases.length < 1 ) + throw new IllegalArgumentException("The reference base must be provided to write VCF records"); - // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO - GenomeLoc loc = vc.getLocation(); + try { + GenomeLoc loc = vc.getLocation(); + Map alleleMap = new HashMap(vc.getAlleles().size()); + alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup - String contig = loc.getContig(); - long position = loc.getStart(); - String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD; + // CHROM + mWriter.write(loc.getContig()); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + // POS + // TODO -- Remove this off-by-one issue when positions are settled in the input + mWriter.write(String.valueOf(loc.getStart() - (vc.isIndel() ? 1 : 0))); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - // deal with the reference - String referenceFromVC = new String(vc.getReference().getBases()); + // ID + String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD; + mWriter.write(ID); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1; - // TODO- clean up these flags and associated code + // REF + alleleMap.put(vc.getReference(), "0"); + String refString = makeAlleleString(vc.getReference(), vc.isIndel(), refBases[0]); + mWriter.write(refString); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + // ALT + if ( vc.getAlternateAlleles().size() > 0 ) { + Allele altAllele = vc.getAlternateAllele(0); + alleleMap.put(altAllele, "1"); + String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); + mWriter.write(alt); - Map alleleMap = new HashMap(); - alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFConstants.EMPTY_ALLELE)); // convenience for lookup - List vcfAltAlleles = new ArrayList(); - - int numTrailingBases = 0, numPaddingBases = 0; - String paddingBases = ""; - String trailingBases = ""; - - // search for reference allele and find trailing and padding at the end. - // See first if all alleles have a base encoding (ie no deletions) - // if so, add one common base to all alleles (reference at this location) - boolean hasBasesInAllAlleles = true; - String refString = null; - for ( Allele a : vc.getAlleles() ) { - if (a.isNull() || a.isNoCall()) - hasBasesInAllAlleles = false; - - if (a.isReference()) - refString = new String(a.getBases()); - } - - - // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK. - if (!hasBasesInAllAlleles) { - trailingBases = new String(refBases); - numTrailingBases = 1; - position--; - } - - - for ( Allele a : vc.getAlleles() ) { - - VCFGenotypeEncoding encoding; - - - String alleleString = new String(a.getBases()); - String s = trailingBases+alleleString+paddingBases; - - encoding = new VCFGenotypeEncoding(s, true); - - // overwrite reference string by possibly padded version - if (a.isReference()) - referenceFromVC = s; - - else { - vcfAltAlleles.add(encoding); + for (int i = 1; i < vc.getAlternateAlleles().size(); i++) { + altAllele = vc.getAlternateAllele(i); + alleleMap.put(altAllele, String.valueOf(i+1)); + alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); + mWriter.write(","); + mWriter.write(alt); + } + } else { + mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD); } + mWriter.write(VCFConstants.FIELD_SEPARATOR); - alleleMap.put(a, encoding); - } + // QUAL + if ( !vc.hasNegLog10PError() ) + mWriter.write(VCFConstants.MISSING_VALUE_v4); + else + mWriter.write(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, vc.getPhredScaledQual())); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - List vcfGenotypeAttributeKeys = new ArrayList(); - if ( vc.hasGenotypes() ) { - vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - for ( String key : calcVCFGenotypeKeys(vc) ) { - vcfGenotypeAttributeKeys.add(key); - } - } else if ( header.hasGenotypingData() ) { - // this needs to be done in case all samples are no-calls - vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - } + // FILTER + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + mWriter.write(filters); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); - - List genotypeObjects = new ArrayList(vc.getGenotypes().size()); - for ( Genotype g : vc.getGenotypesSortedByName() ) { - List encodings = new ArrayList(g.getPloidy()); - - for ( Allele a : g.getAlleles() ) { - encodings.add(alleleMap.get(a)); - } - - VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; - VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing); - - for ( String key : vcfGenotypeAttributeKeys ) { - if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + // INFO + Map infoFields = new TreeMap(); + for ( Map.Entry field : vc.getAttributes().entrySet() ) { + String key = field.getKey(); + if ( key.equals("ID") ) continue; + String outputValue = formatVCFField(field.getValue()); + if ( outputValue != null ) + infoFields.put(key, outputValue); + } + writeInfoString(infoFields); + + // FORMAT + List genotypeAttributeKeys = new ArrayList(); + if ( vc.hasGenotypes() ) { + genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); + for ( String key : calcVCFGenotypeKeys(vc) ) { + genotypeAttributeKeys.add(key); + } + } else if ( mHeader.hasGenotypingData() ) { + // this needs to be done in case all samples are no-calls + genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); + } + + if ( genotypeAttributeKeys.size() > 0 ) { + String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + mWriter.write(genotypeFormatString); + + addGenotypeData(vc, alleleMap, genotypeAttributeKeys); + } + + mWriter.write("\n"); + mWriter.flush(); // necessary so that writing to an output stream will work + } catch (IOException e) { + throw new RuntimeException("Unable to write the VCF object to a file"); + } + + } + + private String makeAlleleString(Allele allele, boolean isIndel, byte ref) { + String s = new String(allele.getBases()); + if ( isIndel || s.length() == 0 ) // in case the context is monomorphic at an indel site + s = (char)ref + s; + return s; + } + + /** + * create the info string; assumes that no values are null + * + * @param infoFields a map of info fields + * @throws IOException for writer + */ + protected void writeInfoString(Map infoFields) throws IOException { + if ( infoFields.isEmpty() ) { + mWriter.write(VCFConstants.EMPTY_INFO_FIELD); + return; + } + + boolean isFirst = true; + for ( Map.Entry entry : infoFields.entrySet() ) { + if ( isFirst ) + isFirst = false; + else + mWriter.write(VCFConstants.INFO_FIELD_SEPARATOR); + + String key = entry.getKey(); + mWriter.write(key); + + if ( !entry.getValue().equals("") ) { + int numVals = 1; + VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); + if ( metaData != null ) + numVals = metaData.getCount(); + + // take care of unbounded encoding + if ( numVals == VCFInfoHeaderLine.UNBOUNDED ) + numVals = 1; + + if ( numVals > 0 ) { + mWriter.write("="); + mWriter.write(entry.getValue()); + } + } + } + } + + /** + * add the genotype data + * + * @param vc the variant context + * @param genotypeFormatKeys Genotype formatting string + * @param alleleMap alleles for this context + * @throws IOException for writer + */ + private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) + throws IOException { + + for ( String sample : mHeader.getGenotypeSamples() ) { + mWriter.write(VCFConstants.FIELD_SEPARATOR); + + Genotype g = vc.getGenotype(sample); + if ( g == null ) { + // TODO -- The VariantContext needs to know what the general ploidy is of the samples + // TODO -- We shouldn't be assuming diploid genotypes here! + mWriter.write(VCFConstants.EMPTY_GENOTYPE); + continue; + } + + writeAllele(g.getAllele(0), alleleMap); + for (int i = 1; i < g.getPloidy(); i++) { + mWriter.write(g.genotypesArePhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap); + } + + List attrs = new ArrayList(genotypeFormatKeys.size()); + for ( String key : genotypeFormatKeys ) { + if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + continue; Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4; @@ -282,255 +324,77 @@ public class VCFWriter { else { val = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); } - } else if ( key.equals(VCFConstants.DEPTH_KEY) && val == null ) { ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY); if ( pileup != null ) val = pileup.size(); } else if ( key.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - // VCF 4.0 key for no filters is "." val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4; } + VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key); + if ( metaData != null ) { + VCFHeaderLineType formatType = metaData.getType(); + if ( !(val instanceof String) ) + val = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT); - Object newVal; - if (typeUsedForFormatString.containsKey(key)) { - VCFHeaderLineType formatType = typeUsedForFormatString.get(key); - if (!val.getClass().equals(String.class)) - newVal = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT); - else - newVal = val; - - } - else { - newVal = val; - } - - if (numberUsedForFormatFields.containsKey(key)){ - int numInFormatField = numberUsedForFormatFields.get(key); - if (numInFormatField>1 && val.equals(VCFConstants.MISSING_VALUE_v4)) { - // If we have a missing field but multiple values are expected, we need to construct new string with all fields. - // for example for Number =2, string has to be ".,." - StringBuilder v = new StringBuilder(VCFConstants.MISSING_VALUE_v4); + int numInFormatField = metaData.getCount(); + if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { + // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. + // For example, if Number=2, the string has to be ".,." + StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); for ( int i = 1; i < numInFormatField; i++ ) { - v.append(","); - v.append(VCFConstants.MISSING_VALUE_v4); + sb.append(","); + sb.append(VCFConstants.MISSING_VALUE_v4); } - newVal = v.toString(); + val = sb.toString(); } } - // assume that if key is absent, given string encoding suffices. - String outputValue = formatVCFField(key, newVal); + // assume that if key is absent, then the given string encoding suffices + String outputValue = formatVCFField(val); if ( outputValue != null ) - vcfG.setField(key, outputValue); + attrs.add(outputValue); } - genotypeObjects.add(vcfG); - } - - mGenotypeRecords.clear(); - mGenotypeRecords.addAll(genotypeObjects); - // info fields - Map infoFields = new TreeMap(); - for ( Map.Entry elt : vc.getAttributes().entrySet() ) { - String key = elt.getKey(); - if ( key.equals("ID") ) - continue; - - String outputValue = formatVCFField(key, elt.getValue()); - if ( outputValue != null ) - infoFields.put(key, outputValue); - } - - - - builder.append(contig); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(position); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(ID); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(referenceFromVC); - builder.append(VCFConstants.FIELD_SEPARATOR); - - if ( vcfAltAlleles.size() > 0 ) { - builder.append(vcfAltAlleles.get(0)); - for ( int i = 1; i < vcfAltAlleles.size(); i++ ) { - builder.append(","); - builder.append(vcfAltAlleles.get(i)); - } - } else { - builder.append(VCFConstants.EMPTY_ALLELE); - } - builder.append(VCFConstants.FIELD_SEPARATOR); - - - if ( qual == -1 ) - builder.append(VCFConstants.MISSING_VALUE_v4); - else - builder.append(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual)); - - builder.append(VCFConstants.FIELD_SEPARATOR); - - - builder.append(filters); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(createInfoString(infoFields)); - - if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) { - addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles); - } - - return builder.toString(); - - - } - - /** - * add the genotype data - * - * @param builder the string builder - * @param header the header object - * @param genotypeFormatString Genotype formatting string - * @param vcfAltAlleles alternate alleles at this site - */ - private void addGenotypeData(StringBuilder builder, VCFHeader header, - String genotypeFormatString, ListvcfAltAlleles) { - Map gMap = genotypeListToMap(mGenotypeRecords); - - StringBuffer tempStr = new StringBuffer(); - if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) { - for ( String sample : gMap.keySet() ) { - if ( !header.getGenotypeSamples().contains(sample) ) - System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header"); + // strip off trailing missing values + for (int i = attrs.size()-1; i >= 0; i--) { + if ( attrs.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) + attrs.remove(i); else - header.getGenotypeSamples().remove(sample); - } - throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated"); - } - tempStr.append(VCFConstants.FIELD_SEPARATOR + genotypeFormatString); - - String[] genotypeFormatStrings = genotypeFormatString.split(":"); - - for ( String genotype : header.getGenotypeSamples() ) { - tempStr.append(VCFConstants.FIELD_SEPARATOR); - if ( gMap.containsKey(genotype) ) { - VCFGenotypeRecord rec = gMap.get(genotype); - String genotypeString = rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings, true); - - // Override default produced genotype string when there are trailing - String[] genotypeStrings = genotypeString.split(":"); - int lastUsedPosition = 0; - for (int k=genotypeStrings.length-1; k >=1; k--) { - // see if string represents an empty field. If not, break. - if (!isEmptyField(genotypeStrings[k]) ) { - lastUsedPosition = k; - break; - } - } - // now reconstruct genotypeString from 0 to lastUsedPosition - genotypeString = Utils.join(":",genotypeStrings, 0,lastUsedPosition+1); - tempStr.append(genotypeString); - gMap.remove(genotype); - } else { - tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings, true)); - } - } - if ( gMap.size() != 0 ) { - for ( String sample : gMap.keySet() ) - System.err.println("Sample " + sample + " is being genotyped but isn't in the header."); - throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); - } - - builder.append(tempStr); - } - - boolean isEmptyField(String field) { - // check if given genotype field is empty, ie either ".", or ".,.", or ".,.,.", etc. - String[] fields = field.split(","); - boolean isEmpty = true; - - for (int k=0; k < fields.length; k++) { - if (!fields[k].equals(VCFConstants.MISSING_VALUE_v4)) { - isEmpty = false; - break; + break; } - } - return isEmpty; - - } - /** - * create a genotype mapping from a list and their sample names - * - * @param list a list of genotype samples - * @return a mapping of the sample name to VCF genotype record - */ - private static Map genotypeListToMap(List list) { - Map map = new HashMap(); - for (int i = 0; i < list.size(); i++) { - VCFGenotypeRecord rec = list.get(i); - map.put(rec.getSampleName(), rec); - } - return map; - } - - /** - * create the info string - * - * @param infoFields a map of info fields - * @return a string representing the infomation fields - */ - protected String createInfoString(Map infoFields) { - StringBuffer info = new StringBuffer(); - boolean isFirst = true; - for (Map.Entry entry : infoFields.entrySet()) { - if ( isFirst ) - isFirst = false; - else - info.append(VCFConstants.INFO_FIELD_SEPARATOR); - - info.append(entry.getKey()); - - if ( entry.getValue() != null && !entry.getValue().equals("") ) { - int numVals = 1; - String key = entry.getKey(); - if (numberUsedForInfoFields.containsKey(key)) { - numVals = numberUsedForInfoFields.get(key); - } - - // take care of unbounded encoding - // TODO - workaround for "-1" in original INFO header structure - if (numVals == VCFInfoHeaderLine.UNBOUNDED || numVals < 0) - numVals = 1; - - if (numVals > 0) { - info.append("="); - info.append(entry.getValue()); - } + for (String s : attrs ) { + mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); + mWriter.write(s); } } - return info.length() == 0 ? VCFConstants.EMPTY_INFO_FIELD : info.toString(); } - private static String formatVCFField(String key, Object val) { + private void writeAllele(Allele allele, Map alleleMap) throws IOException { + String encoding = alleleMap.get(allele); + if ( encoding == null ) + throw new StingException("Allele " + allele + " is not an allele in the variant context"); + mWriter.write(encoding); + } + + private static String formatVCFField(Object val) { String result; if ( val == null ) - result = VCFGenotypeRecord.getMissingFieldValue(key); - else if ( val instanceof Double ) { - result = String.format("%.2f", (Double)val); - } + result = VCFConstants.MISSING_VALUE_v4; + else if ( val instanceof Double ) + result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val); else if ( val instanceof Boolean ) result = (Boolean)val ? "" : null; // empty string for true, null for false else if ( val instanceof List ) { List list = (List)val; if ( list.size() == 0 ) - return formatVCFField(key, null); - StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0))); + return formatVCFField(null); + StringBuffer sb = new StringBuffer(formatVCFField(list.get(0))); for ( int i = 1; i < list.size(); i++) { sb.append(","); - sb.append(formatVCFField(key, list.get(i))); + sb.append(formatVCFField(list.get(i))); } result = sb.toString(); } else diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java index 675aa0fa2..a95e9a80a 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java @@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest { String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s"; WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("845b9a15ac947052ddded5b79228e5ec")); + Arrays.asList("fad2dd71550dec064d458c4aa83e4de9")); executeTest("Test Indels", spec); } }