diff --git a/java/src/org/broad/tribble/vcf/VCFConstants.java b/java/src/org/broad/tribble/vcf/VCFConstants.java index e36de3e31..5c8206e3d 100755 --- a/java/src/org/broad/tribble/vcf/VCFConstants.java +++ b/java/src/org/broad/tribble/vcf/VCFConstants.java @@ -58,6 +58,9 @@ public final class VCFConstants { public static final String FIELD_SEPARATOR = "\t"; public static final String FILTER_CODE_SEPARATOR = ";"; public static final String INFO_FIELD_SEPARATOR = ";"; + public static final String UNPHASED = "/"; + public static final String PHASED = "|"; + public static final String PHASED_SWITCH_PROB_v3 = "\\"; // missing/default values public static final String UNFILTERED = "."; diff --git a/java/src/org/broad/tribble/vcf/VCFHeader.java b/java/src/org/broad/tribble/vcf/VCFHeader.java index fc084062b..02544d94d 100644 --- a/java/src/org/broad/tribble/vcf/VCFHeader.java +++ b/java/src/org/broad/tribble/vcf/VCFHeader.java @@ -20,6 +20,8 @@ public class VCFHeader { // the associated meta data private final Set mMetaData; + private final Map mInfoMetaData = new HashMap(); + private final Map mFormatMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -41,6 +43,7 @@ public class VCFHeader { public VCFHeader(Set metaData) { mMetaData = new TreeSet(metaData); loadVCFVersion(); + loadMetaDataMaps(); } /** @@ -57,6 +60,7 @@ public class VCFHeader { } if (genotypeSampleNames.size() > 0) hasGenotypingData = true; loadVCFVersion(); + loadMetaDataMaps(); } /** @@ -74,6 +78,18 @@ public class VCFHeader { } + /** + * load the format/info meta data maps (these are used for quick lookup by key name) + */ + private void loadMetaDataMaps() { + for ( VCFHeaderLine line : mMetaData ) { + if ( line instanceof VCFInfoHeaderLine ) + mInfoMetaData.put(line.getKey(), (VCFInfoHeaderLine)line); + else if ( line instanceof VCFFormatHeaderLine ) + mFormatMetaData.put(line.getKey(), (VCFFormatHeaderLine)line); + } + } + /** * get the header fields in order they're presented in the input file (which is now required to be * the order presented in the spec). @@ -117,11 +133,26 @@ public class VCFHeader { return hasGenotypingData; } - /** @return the column count, */ + /** @return the column count */ public int getColumnCount() { return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); } + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(String key) { + return mInfoMetaData.get(key); + } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(String key) { + return mFormatMetaData.get(key); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java index 873e51b1c..cd0f6cbef 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFGenotypeWriterStorage.java @@ -51,14 +51,6 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage public void addRecord(VCFRecord vcfRecord) { outputTracker.getStorage(this).addRecord(vcfRecord); } - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value) { - outputTracker.getStorage(this).setValidationStringency(value); - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java index 3153b7480..0c7043a51 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java @@ -150,8 +150,10 @@ public class SequenomValidationConverter extends RodWalker 0 ) { + System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords)); + hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords))); + } } VCFHeader header = new VCFHeader(hInfo, sampleNames); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java index 9cd3b875e..63abb54fd 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriter.java @@ -27,12 +27,4 @@ public interface VCFGenotypeWriter extends GenotypeWriter { * @param vcfRecord Record to add. */ public void addRecord(VCFRecord vcfRecord); - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value); - - public enum VALIDATION_STRINGENCY { STRICT, SILENT }; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index 6aba9dad8..c32f6c0f3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -27,9 +27,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { // our log, which we want to capture anything from this class protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class); - // validation stringency - private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT; - // allowed genotype format strings private Set allowedGenotypeFormatStrings = null; @@ -43,6 +40,10 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { mWriter = new VCFWriter(writeTo); } + public void addRecord(VCFRecord vcfRecord) { + mWriter.addRecord(vcfRecord); + } + /** * initialize this VCF header * @@ -92,16 +93,4 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter { vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); mWriter.add(vc, new byte[]{(byte)refAllele.charAt(0)}); } - - public void addRecord(VCFRecord vcfRecord) { - mWriter.addRecord(vcfRecord, validationStringency); - } - - /** - * set the validation stringency - * @param value validation stringency value - */ - public void setValidationStringency(VALIDATION_STRINGENCY value) { - validationStringency = value; - } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 51cfb044f..e4920f915 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.CalledGenotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,7 +20,6 @@ import java.util.*; */ public class VCFWriter { - // the VCF header we're storing private VCFHeader mHeader = null; @@ -29,15 +29,6 @@ public class VCFWriter { // were filters applied? private boolean filtersWereAppliedToContext = false; - // our genotype sample fields - private static final List mGenotypeRecords = new ArrayList(); - - // Properties only used when using VCF4.0 encoding - Map typeUsedForFormatString = new HashMap(); - Map typeUsedForInfoFields = new HashMap(); - Map numberUsedForInfoFields = new HashMap(); - Map numberUsedForFormatFields = new HashMap(); - /** * create a VCF writer, given a file to write to * @@ -72,43 +63,36 @@ public class VCFWriter { mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n"); for ( VCFHeaderLine line : header.getMetaData() ) { - if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) || - line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) - continue; + if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) || + line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) || + line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) + continue; - // Record, if line corresponds to a FORMAT field, which type will be used for writing value - if (line.getClass() == VCFFormatHeaderLine.class) { - VCFFormatHeaderLine a = (VCFFormatHeaderLine)line; - String key = a.getName(); - typeUsedForFormatString.put(key,a.getType()); - int num = a.getCount(); - numberUsedForFormatFields.put(key,num); - } else if (line.getClass() == VCFInfoHeaderLine.class) { - VCFInfoHeaderLine a = (VCFInfoHeaderLine)line; - String key = a.getName(); - typeUsedForInfoFields.put(key,a.getType()); - int num = a.getCount(); - numberUsedForInfoFields.put(key, num); - } else if (line.getClass() == VCFFilterHeaderLine.class) { - filtersWereAppliedToContext = true; - } + // are the records filtered (so we know what to put in the FILTER column of passing records) ? + if ( line instanceof VCFFilterHeaderLine ) + filtersWereAppliedToContext = true; - mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n"); + mWriter.write(VCFHeader.METADATA_INDICATOR); + mWriter.write(line.toString()); + mWriter.write("\n"); } // write out the column line - StringBuilder b = new StringBuilder(); - b.append(VCFHeader.HEADER_INDICATOR); - for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) - b.append(field + VCFConstants.FIELD_SEPARATOR); - - if (header.hasGenotypingData()) { - b.append("FORMAT" + VCFConstants.FIELD_SEPARATOR); - for (String field : header.getGenotypeSamples()) - b.append(field + VCFConstants.FIELD_SEPARATOR); + mWriter.write(VCFHeader.HEADER_INDICATOR); + for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { + mWriter.write(field.toString()); + mWriter.write(VCFConstants.FIELD_SEPARATOR); } - mWriter.write(b.toString() + "\n"); + + if ( header.hasGenotypingData() ) { + mWriter.write("FORMAT"); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + for ( String sample : header.getGenotypeSamples() ) { + mWriter.write(sample); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + } } + + mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { @@ -116,39 +100,13 @@ public class VCFWriter { } } - /** - * output a record to the VCF file - * - * @param record the record to output - */ - public void add(VCFRecord record) { - addRecord(record); - } - - public void addRecord(VCFRecord record) { - addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT); - } - - public void add(VariantContext vc, byte[] refBases) { - if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added"); - - String vcfString = toStringEncoding(vc, mHeader, refBases); - try { - mWriter.write(vcfString + "\n"); - mWriter.flush(); // necessary so that writing to an output stream will work - } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to a file"); - } - - } /** * output a record to the VCF file * * @param record the record to output - * @param validationStringency the validation stringency */ - public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) { + @Deprecated + public void addRecord(VCFRecord record) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added"); @@ -174,104 +132,188 @@ public class VCFWriter { } } - private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) { - StringBuilder builder = new StringBuilder(); + public void add(VariantContext vc, byte[] refBases) { + if ( mHeader == null ) + throw new IllegalStateException("The VCF Header must be written before records can be added"); + if ( refBases == null || refBases.length < 1 ) + throw new IllegalArgumentException("The reference base must be provided to write VCF records"); - // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO - GenomeLoc loc = vc.getLocation(); + try { + GenomeLoc loc = vc.getLocation(); + Map alleleMap = new HashMap(vc.getAlleles().size()); + alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup - String contig = loc.getContig(); - long position = loc.getStart(); - String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD; + // CHROM + mWriter.write(loc.getContig()); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + // POS + // TODO -- Remove this off-by-one issue when positions are settled in the input + mWriter.write(String.valueOf(loc.getStart() - (vc.isIndel() ? 1 : 0))); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - // deal with the reference - String referenceFromVC = new String(vc.getReference().getBases()); + // ID + String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD; + mWriter.write(ID); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1; - // TODO- clean up these flags and associated code + // REF + alleleMap.put(vc.getReference(), "0"); + String refString = makeAlleleString(vc.getReference(), vc.isIndel(), refBases[0]); + mWriter.write(refString); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + // ALT + if ( vc.getAlternateAlleles().size() > 0 ) { + Allele altAllele = vc.getAlternateAllele(0); + alleleMap.put(altAllele, "1"); + String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); + mWriter.write(alt); - Map alleleMap = new HashMap(); - alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFConstants.EMPTY_ALLELE)); // convenience for lookup - List vcfAltAlleles = new ArrayList(); - - int numTrailingBases = 0, numPaddingBases = 0; - String paddingBases = ""; - String trailingBases = ""; - - // search for reference allele and find trailing and padding at the end. - // See first if all alleles have a base encoding (ie no deletions) - // if so, add one common base to all alleles (reference at this location) - boolean hasBasesInAllAlleles = true; - String refString = null; - for ( Allele a : vc.getAlleles() ) { - if (a.isNull() || a.isNoCall()) - hasBasesInAllAlleles = false; - - if (a.isReference()) - refString = new String(a.getBases()); - } - - - // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK. - if (!hasBasesInAllAlleles) { - trailingBases = new String(refBases); - numTrailingBases = 1; - position--; - } - - - for ( Allele a : vc.getAlleles() ) { - - VCFGenotypeEncoding encoding; - - - String alleleString = new String(a.getBases()); - String s = trailingBases+alleleString+paddingBases; - - encoding = new VCFGenotypeEncoding(s, true); - - // overwrite reference string by possibly padded version - if (a.isReference()) - referenceFromVC = s; - - else { - vcfAltAlleles.add(encoding); + for (int i = 1; i < vc.getAlternateAlleles().size(); i++) { + altAllele = vc.getAlternateAllele(i); + alleleMap.put(altAllele, String.valueOf(i+1)); + alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]); + mWriter.write(","); + mWriter.write(alt); + } + } else { + mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD); } + mWriter.write(VCFConstants.FIELD_SEPARATOR); - alleleMap.put(a, encoding); - } + // QUAL + if ( !vc.hasNegLog10PError() ) + mWriter.write(VCFConstants.MISSING_VALUE_v4); + else + mWriter.write(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, vc.getPhredScaledQual())); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - List vcfGenotypeAttributeKeys = new ArrayList(); - if ( vc.hasGenotypes() ) { - vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - for ( String key : calcVCFGenotypeKeys(vc) ) { - vcfGenotypeAttributeKeys.add(key); - } - } else if ( header.hasGenotypingData() ) { - // this needs to be done in case all samples are no-calls - vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); - } + // FILTER + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED); + mWriter.write(filters); + mWriter.write(VCFConstants.FIELD_SEPARATOR); - String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); - - List genotypeObjects = new ArrayList(vc.getGenotypes().size()); - for ( Genotype g : vc.getGenotypesSortedByName() ) { - List encodings = new ArrayList(g.getPloidy()); - - for ( Allele a : g.getAlleles() ) { - encodings.add(alleleMap.get(a)); - } - - VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; - VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing); - - for ( String key : vcfGenotypeAttributeKeys ) { - if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + // INFO + Map infoFields = new TreeMap(); + for ( Map.Entry field : vc.getAttributes().entrySet() ) { + String key = field.getKey(); + if ( key.equals("ID") ) continue; + String outputValue = formatVCFField(field.getValue()); + if ( outputValue != null ) + infoFields.put(key, outputValue); + } + writeInfoString(infoFields); + + // FORMAT + List genotypeAttributeKeys = new ArrayList(); + if ( vc.hasGenotypes() ) { + genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); + for ( String key : calcVCFGenotypeKeys(vc) ) { + genotypeAttributeKeys.add(key); + } + } else if ( mHeader.hasGenotypingData() ) { + // this needs to be done in case all samples are no-calls + genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); + } + + if ( genotypeAttributeKeys.size() > 0 ) { + String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); + mWriter.write(VCFConstants.FIELD_SEPARATOR); + mWriter.write(genotypeFormatString); + + addGenotypeData(vc, alleleMap, genotypeAttributeKeys); + } + + mWriter.write("\n"); + mWriter.flush(); // necessary so that writing to an output stream will work + } catch (IOException e) { + throw new RuntimeException("Unable to write the VCF object to a file"); + } + + } + + private String makeAlleleString(Allele allele, boolean isIndel, byte ref) { + String s = new String(allele.getBases()); + if ( isIndel || s.length() == 0 ) // in case the context is monomorphic at an indel site + s = (char)ref + s; + return s; + } + + /** + * create the info string; assumes that no values are null + * + * @param infoFields a map of info fields + * @throws IOException for writer + */ + protected void writeInfoString(Map infoFields) throws IOException { + if ( infoFields.isEmpty() ) { + mWriter.write(VCFConstants.EMPTY_INFO_FIELD); + return; + } + + boolean isFirst = true; + for ( Map.Entry entry : infoFields.entrySet() ) { + if ( isFirst ) + isFirst = false; + else + mWriter.write(VCFConstants.INFO_FIELD_SEPARATOR); + + String key = entry.getKey(); + mWriter.write(key); + + if ( !entry.getValue().equals("") ) { + int numVals = 1; + VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); + if ( metaData != null ) + numVals = metaData.getCount(); + + // take care of unbounded encoding + if ( numVals == VCFInfoHeaderLine.UNBOUNDED ) + numVals = 1; + + if ( numVals > 0 ) { + mWriter.write("="); + mWriter.write(entry.getValue()); + } + } + } + } + + /** + * add the genotype data + * + * @param vc the variant context + * @param genotypeFormatKeys Genotype formatting string + * @param alleleMap alleles for this context + * @throws IOException for writer + */ + private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) + throws IOException { + + for ( String sample : mHeader.getGenotypeSamples() ) { + mWriter.write(VCFConstants.FIELD_SEPARATOR); + + Genotype g = vc.getGenotype(sample); + if ( g == null ) { + // TODO -- The VariantContext needs to know what the general ploidy is of the samples + // TODO -- We shouldn't be assuming diploid genotypes here! + mWriter.write(VCFConstants.EMPTY_GENOTYPE); + continue; + } + + writeAllele(g.getAllele(0), alleleMap); + for (int i = 1; i < g.getPloidy(); i++) { + mWriter.write(g.genotypesArePhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap); + } + + List attrs = new ArrayList(genotypeFormatKeys.size()); + for ( String key : genotypeFormatKeys ) { + if ( key.equals(VCFConstants.GENOTYPE_KEY) ) + continue; Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4; @@ -282,255 +324,77 @@ public class VCFWriter { else { val = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); } - } else if ( key.equals(VCFConstants.DEPTH_KEY) && val == null ) { ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY); if ( pileup != null ) val = pileup.size(); } else if ( key.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - // VCF 4.0 key for no filters is "." val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4; } + VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key); + if ( metaData != null ) { + VCFHeaderLineType formatType = metaData.getType(); + if ( !(val instanceof String) ) + val = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT); - Object newVal; - if (typeUsedForFormatString.containsKey(key)) { - VCFHeaderLineType formatType = typeUsedForFormatString.get(key); - if (!val.getClass().equals(String.class)) - newVal = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT); - else - newVal = val; - - } - else { - newVal = val; - } - - if (numberUsedForFormatFields.containsKey(key)){ - int numInFormatField = numberUsedForFormatFields.get(key); - if (numInFormatField>1 && val.equals(VCFConstants.MISSING_VALUE_v4)) { - // If we have a missing field but multiple values are expected, we need to construct new string with all fields. - // for example for Number =2, string has to be ".,." - StringBuilder v = new StringBuilder(VCFConstants.MISSING_VALUE_v4); + int numInFormatField = metaData.getCount(); + if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { + // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. + // For example, if Number=2, the string has to be ".,." + StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); for ( int i = 1; i < numInFormatField; i++ ) { - v.append(","); - v.append(VCFConstants.MISSING_VALUE_v4); + sb.append(","); + sb.append(VCFConstants.MISSING_VALUE_v4); } - newVal = v.toString(); + val = sb.toString(); } } - // assume that if key is absent, given string encoding suffices. - String outputValue = formatVCFField(key, newVal); + // assume that if key is absent, then the given string encoding suffices + String outputValue = formatVCFField(val); if ( outputValue != null ) - vcfG.setField(key, outputValue); + attrs.add(outputValue); } - genotypeObjects.add(vcfG); - } - - mGenotypeRecords.clear(); - mGenotypeRecords.addAll(genotypeObjects); - // info fields - Map infoFields = new TreeMap(); - for ( Map.Entry elt : vc.getAttributes().entrySet() ) { - String key = elt.getKey(); - if ( key.equals("ID") ) - continue; - - String outputValue = formatVCFField(key, elt.getValue()); - if ( outputValue != null ) - infoFields.put(key, outputValue); - } - - - - builder.append(contig); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(position); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(ID); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(referenceFromVC); - builder.append(VCFConstants.FIELD_SEPARATOR); - - if ( vcfAltAlleles.size() > 0 ) { - builder.append(vcfAltAlleles.get(0)); - for ( int i = 1; i < vcfAltAlleles.size(); i++ ) { - builder.append(","); - builder.append(vcfAltAlleles.get(i)); - } - } else { - builder.append(VCFConstants.EMPTY_ALLELE); - } - builder.append(VCFConstants.FIELD_SEPARATOR); - - - if ( qual == -1 ) - builder.append(VCFConstants.MISSING_VALUE_v4); - else - builder.append(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual)); - - builder.append(VCFConstants.FIELD_SEPARATOR); - - - builder.append(filters); - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(createInfoString(infoFields)); - - if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) { - addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles); - } - - return builder.toString(); - - - } - - /** - * add the genotype data - * - * @param builder the string builder - * @param header the header object - * @param genotypeFormatString Genotype formatting string - * @param vcfAltAlleles alternate alleles at this site - */ - private void addGenotypeData(StringBuilder builder, VCFHeader header, - String genotypeFormatString, ListvcfAltAlleles) { - Map gMap = genotypeListToMap(mGenotypeRecords); - - StringBuffer tempStr = new StringBuffer(); - if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) { - for ( String sample : gMap.keySet() ) { - if ( !header.getGenotypeSamples().contains(sample) ) - System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header"); + // strip off trailing missing values + for (int i = attrs.size()-1; i >= 0; i--) { + if ( attrs.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) + attrs.remove(i); else - header.getGenotypeSamples().remove(sample); - } - throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated"); - } - tempStr.append(VCFConstants.FIELD_SEPARATOR + genotypeFormatString); - - String[] genotypeFormatStrings = genotypeFormatString.split(":"); - - for ( String genotype : header.getGenotypeSamples() ) { - tempStr.append(VCFConstants.FIELD_SEPARATOR); - if ( gMap.containsKey(genotype) ) { - VCFGenotypeRecord rec = gMap.get(genotype); - String genotypeString = rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings, true); - - // Override default produced genotype string when there are trailing - String[] genotypeStrings = genotypeString.split(":"); - int lastUsedPosition = 0; - for (int k=genotypeStrings.length-1; k >=1; k--) { - // see if string represents an empty field. If not, break. - if (!isEmptyField(genotypeStrings[k]) ) { - lastUsedPosition = k; - break; - } - } - // now reconstruct genotypeString from 0 to lastUsedPosition - genotypeString = Utils.join(":",genotypeStrings, 0,lastUsedPosition+1); - tempStr.append(genotypeString); - gMap.remove(genotype); - } else { - tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings, true)); - } - } - if ( gMap.size() != 0 ) { - for ( String sample : gMap.keySet() ) - System.err.println("Sample " + sample + " is being genotyped but isn't in the header."); - throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); - } - - builder.append(tempStr); - } - - boolean isEmptyField(String field) { - // check if given genotype field is empty, ie either ".", or ".,.", or ".,.,.", etc. - String[] fields = field.split(","); - boolean isEmpty = true; - - for (int k=0; k < fields.length; k++) { - if (!fields[k].equals(VCFConstants.MISSING_VALUE_v4)) { - isEmpty = false; - break; + break; } - } - return isEmpty; - - } - /** - * create a genotype mapping from a list and their sample names - * - * @param list a list of genotype samples - * @return a mapping of the sample name to VCF genotype record - */ - private static Map genotypeListToMap(List list) { - Map map = new HashMap(); - for (int i = 0; i < list.size(); i++) { - VCFGenotypeRecord rec = list.get(i); - map.put(rec.getSampleName(), rec); - } - return map; - } - - /** - * create the info string - * - * @param infoFields a map of info fields - * @return a string representing the infomation fields - */ - protected String createInfoString(Map infoFields) { - StringBuffer info = new StringBuffer(); - boolean isFirst = true; - for (Map.Entry entry : infoFields.entrySet()) { - if ( isFirst ) - isFirst = false; - else - info.append(VCFConstants.INFO_FIELD_SEPARATOR); - - info.append(entry.getKey()); - - if ( entry.getValue() != null && !entry.getValue().equals("") ) { - int numVals = 1; - String key = entry.getKey(); - if (numberUsedForInfoFields.containsKey(key)) { - numVals = numberUsedForInfoFields.get(key); - } - - // take care of unbounded encoding - // TODO - workaround for "-1" in original INFO header structure - if (numVals == VCFInfoHeaderLine.UNBOUNDED || numVals < 0) - numVals = 1; - - if (numVals > 0) { - info.append("="); - info.append(entry.getValue()); - } + for (String s : attrs ) { + mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); + mWriter.write(s); } } - return info.length() == 0 ? VCFConstants.EMPTY_INFO_FIELD : info.toString(); } - private static String formatVCFField(String key, Object val) { + private void writeAllele(Allele allele, Map alleleMap) throws IOException { + String encoding = alleleMap.get(allele); + if ( encoding == null ) + throw new StingException("Allele " + allele + " is not an allele in the variant context"); + mWriter.write(encoding); + } + + private static String formatVCFField(Object val) { String result; if ( val == null ) - result = VCFGenotypeRecord.getMissingFieldValue(key); - else if ( val instanceof Double ) { - result = String.format("%.2f", (Double)val); - } + result = VCFConstants.MISSING_VALUE_v4; + else if ( val instanceof Double ) + result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val); else if ( val instanceof Boolean ) result = (Boolean)val ? "" : null; // empty string for true, null for false else if ( val instanceof List ) { List list = (List)val; if ( list.size() == 0 ) - return formatVCFField(key, null); - StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0))); + return formatVCFField(null); + StringBuffer sb = new StringBuffer(formatVCFField(list.get(0))); for ( int i = 1; i < list.size(); i++) { sb.append(","); - sb.append(formatVCFField(key, list.get(i))); + sb.append(formatVCFField(list.get(i))); } result = sb.toString(); } else diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java index 675aa0fa2..a95e9a80a 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java @@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest { String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped"; String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s"; WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("845b9a15ac947052ddded5b79228e5ec")); + Arrays.asList("fad2dd71550dec064d458c4aa83e4de9")); executeTest("Test Indels", spec); } }