From 3ca2b7374b5f7d8f29027ac743f99e09b3fcf39f Mon Sep 17 00:00:00 2001 From: delangel Date: Tue, 29 Jun 2010 16:43:00 +0000 Subject: [PATCH] Fixes to better deal with the "Type" and "Number" field in the INFO and FORMAT header lines in VCF4.0. We now record these fields and provide appropriate conversions. This is the first version that passes fully the VCF validator. Also, moved the flag indicating VCF4.0 to the VCFWriter constructor. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3669 348d0f76-0448-11de-a6fe-93d51630548a --- .../tribble/vcf/VCFFormatHeaderLine.java | 3 +- .../walkers/VCF4WriterTestWalker.java | 3 +- .../sting/utils/genotype/vcf/VCFWriter.java | 100 +++++++++++++++--- 3 files changed, 89 insertions(+), 17 deletions(-) diff --git a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java index 0997f4422..ce8418d8e 100755 --- a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java +++ b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java @@ -22,7 +22,8 @@ public class VCFFormatHeaderLine extends VCFHeaderLine { public Object convert(String value) { switch (this) { case Integer: - return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name + // Take care of case when string might have say "43.0" but we request an Integer. + return Math.round(java.lang.Float.valueOf(value)); // the java.lang is needed since we use Integer as a enum name case Float: return java.lang.Float.valueOf(value); case String: diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java index b57f8dcd1..b2eef66d4 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java @@ -85,7 +85,8 @@ public class VCF4WriterTestWalker extends RodWalker { final List dataSources = this.getToolkit().getRodDataSources(); // Open output file specified by output VCF ROD - vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); + + vcfWriter = new VCFWriter(new File(OUTPUT_FILE), true); for( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 4ef2b27a1..f8e35008a 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.CalledGenotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -26,9 +27,17 @@ public class VCFWriter { // the print stream we're writting to BufferedWriter mWriter; + private boolean writingVCF40Format; + // our genotype sample fields private static final List mGenotypeRecords = new ArrayList(); + // Properties only used when using VCF4.0 encoding + Map typeUsedForFormatString = new HashMap(); + Map typeUsedForInfoFields = new HashMap(); + Map numberUsedForInfoFields = new HashMap(); + Map numberUsedForFormatFields = new HashMap(); + // commonly used strings that are in the standard private final String FORMAT_FIELD_SEPARATOR = ":"; private static final String GENOTYPE_FIELD_SEPARATOR = ":"; @@ -44,7 +53,7 @@ public class VCFWriter { private static final String EMPTY_ID_FIELD = "."; private static final String EMPTY_ALLELE_FIELD = "."; private static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; - private static final String MISSING_GENOTYPE_QUALITY = "."; + private static final String MISSING_GENOTYPE_FIELD = "."; /** * create a VCF writer, given a file to write to @@ -52,6 +61,11 @@ public class VCFWriter { * @param location the file location to write to */ public VCFWriter(File location) { + this(location, false); + } + + public VCFWriter(File location, boolean useVCF4Format) { + this.writingVCF40Format = useVCF4Format; FileOutputStream output; try { output = new FileOutputStream(location); @@ -68,26 +82,47 @@ public class VCFWriter { * @param output the file location to write to */ public VCFWriter(OutputStream output) { + // use VCF3.3 by default + this(output, false); + } + public VCFWriter(OutputStream output, boolean useVCF4Format) { + this.writingVCF40Format = useVCF4Format; mWriter = new BufferedWriter(new OutputStreamWriter(output)); } public void writeHeader(VCFHeader header) { - // use 3.3 format by default - // TODO - update to true once 4.0 is on by default - writeHeader(header, false); - } - public void writeHeader(VCFHeader header, boolean useVCF4Format) { this.mHeader = header; + try { // the file format field needs to be written first TreeSet nonFormatMetaData = new TreeSet(); for ( VCFHeaderLine line : header.getMetaData() ) { - if (useVCF4Format) { + if (writingVCF40Format) { if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ) { mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n"); } else { nonFormatMetaData.add(line); } + + // Record, if line corresponds to a FORMAT field, which type will be used for writing value + if (line.getClass() == VCFFormatHeaderLine.class) { + + VCFFormatHeaderLine a = (VCFFormatHeaderLine)line; + String key = a.getmName(); + typeUsedForFormatString.put(key,a.getmType()); + int num = a.getmCount(); + numberUsedForFormatFields.put(key,num); + + } else if (line.getClass() == VCFInfoHeaderLine.class) { + VCFInfoHeaderLine a = (VCFInfoHeaderLine)line; + String key = a.getmName(); + typeUsedForInfoFields.put(key,a.getmType()); + int num = a.getmCount(); + numberUsedForInfoFields.put(key, num); + } + + + } else { if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) { @@ -184,7 +219,6 @@ public class VCFWriter { // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO GenomeLoc loc = vc.getLocation(); - //String referenceBases = new String(vc.getReference().getBases()); String contig = loc.getContig(); long position = loc.getStart(); @@ -288,6 +322,7 @@ public class VCFWriter { if ( key.equals(VCFGenotypeRecord.GENOTYPE_KEY) ) continue; + Object val = g.getAttribute(key); // some exceptions if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) { @@ -295,6 +330,7 @@ public class VCFWriter { val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY; else val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE); + } else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) { ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY); if ( pileup != null ) @@ -303,7 +339,29 @@ public class VCFWriter { val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS; } - String outputValue = formatVCFField(key, val); + String outputValue; + if (writingVCF40Format) { + VCFFormatHeaderLine.FORMAT_TYPE formatType = typeUsedForFormatString.get(key); + Object newVal; + if (!val.getClass().equals(String.class)) + newVal = formatType.convert(String.valueOf(val)); + else + newVal = val; + + if (numberUsedForFormatFields.get(key)>1 && val.equals(MISSING_GENOTYPE_FIELD)) { + // If we have a missing field but multiple values are expected, we need to construct new string with all fields. + // for example for Number =2, string has to be ".,." + StringBuilder v = new StringBuilder(MISSING_GENOTYPE_FIELD); + for ( int i = 1; i < numberUsedForFormatFields.get(key); i++ ) { + v.append(","); + v.append(MISSING_GENOTYPE_FIELD); + } + newVal = v.toString(); + } + outputValue = formatVCFField(key, newVal); + } else { + outputValue = formatVCFField(key, val); + } if ( outputValue != null ) vcfG.setField(key, outputValue); } @@ -320,6 +378,7 @@ public class VCFWriter { if ( key.equals("ID") ) continue; + //TODO - fixme String outputValue = formatVCFField(key, elt.getValue()); if ( outputValue != null ) infoFields.put(key, outputValue); @@ -327,7 +386,6 @@ public class VCFWriter { - builder.append(referenceString); builder.append(FIELD_SEPARATOR); @@ -344,7 +402,7 @@ public class VCFWriter { if ( qual == -1 ) - builder.append(MISSING_GENOTYPE_QUALITY); + builder.append(MISSING_GENOTYPE_FIELD); else builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, qual)); @@ -424,9 +482,10 @@ public class VCFWriter { /** * create the info string * + * @param infoFields a map of info fields * @return a string representing the infomation fields */ - static protected String createInfoString(Map infoFields) { + protected String createInfoString(Map infoFields) { StringBuffer info = new StringBuffer(); boolean isFirst = true; for (Map.Entry entry : infoFields.entrySet()) { @@ -436,8 +495,18 @@ public class VCFWriter { info.append(INFO_FIELD_SEPARATOR); info.append(entry.getKey()); if ( entry.getValue() != null && !entry.getValue().equals("") ) { - info.append("="); - info.append(entry.getValue()); + int numVals = 1; + if (this.writingVCF40Format) { + numVals = numberUsedForInfoFields.get(entry.getKey()); + // take care of unbounded encoding + if (numVals == VCFInfoHeaderLine.UNBOUNDED) + numVals = 1; + + } + if (numVals > 0) { + info.append("="); + info.append(entry.getValue()); + } } } return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString(); @@ -447,8 +516,9 @@ public class VCFWriter { String result; if ( val == null ) result = VCFGenotypeRecord.getMissingFieldValue(key); - else if ( val instanceof Double ) + else if ( val instanceof Double ) { result = String.format("%.2f", (Double)val); + } else if ( val instanceof Boolean ) result = (Boolean)val ? "" : null; // empty string for true, null for false else if ( val instanceof List ) {