diff --git a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java index 520da467f..2cc56e610 100644 --- a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java +++ b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java @@ -24,7 +24,10 @@ public class VCFGenotypeEncoding { private final TYPE mType; // public constructor, that parses out the base string - public VCFGenotypeEncoding(String baseString) { + public VCFGenotypeEncoding(String baseString){ + this(baseString, false); + } + public VCFGenotypeEncoding(String baseString, boolean allowMultipleBaseReference) { if ((baseString.length() == 1)) { // are we an empty (no-call) genotype? if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) { @@ -39,19 +42,23 @@ public class VCFGenotypeEncoding { mType = TYPE.SINGLE_BASE; } } else { // deletion or insertion - if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) { + if (baseString.length() < 1 ||(!allowMultipleBaseReference && (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I'))) { throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)"); } if (baseString.toUpperCase().charAt(0) == 'D') { mLength = Integer.valueOf(baseString.substring(1, baseString.length())); mBases = ""; mType = TYPE.DELETION; - } else { // we're an I + } else if (baseString.toUpperCase().charAt(0) == 'I') { // we're an I mBases = baseString.substring(1, baseString.length()).toUpperCase(); if (!validBases(mBases)) throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString); mLength = mBases.length(); mType = TYPE.INSERTION; + } else{ + mBases = baseString; + mType = TYPE.MIXED; + mLength = mBases.length(); } } } @@ -97,6 +104,7 @@ public class VCFGenotypeEncoding { switch (mType) { case SINGLE_BASE: case UNCALLED: + case MIXED: builder.append(mBases); break; case INSERTION: diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java new file mode 100755 index 000000000..b57f8dcd1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java @@ -0,0 +1,149 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broad.tribble.util.AsciiLineReader; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.RodWalker; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.*; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; + +import java.util.Iterator; + +/** + * Prints out all of the RODs in the input data set. Data is rendered using the toString() method + * of the given ROD. + */ +public class VCF4WriterTestWalker extends RodWalker { + private VCFWriter vcfWriter; + + @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true) + private String OUTPUT_FILE = null; + + + public static final String INPUT_ROD_NAME = "vcf"; + + protected static String line = null; + final TreeSet samples = new TreeSet(); + VCF4Codec vcf4codec = new VCF4Codec(); + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + public void initialize() { + final List dataSources = this.getToolkit().getRodDataSources(); + + // Open output file specified by output VCF ROD + vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); + + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + Object p = rod.getRecordType(); + if(rod.getType().equals(vcf4codec.getClass()) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { + try { + AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath())); + int lineNumber = vcf4codec.readHeader(lineReader); + out.printf("Read %d header lines%n", lineNumber); + + VCFHeader header = vcf4codec.getHeader(VCFHeader.class); + vcfWriter.writeHeader(header); + } + catch (FileNotFoundException e ) { + throw new StingException(e.getMessage()); + } + } + + } + + + + } + + /** + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + GenomeLoc loc = context.getLocation(); + VariantContext vc = tracker.getVariantContext(ref,"vcf", null, loc, true); + + + if (vc == null) + return 0; + + // Write directly variant context to VCF4.0 format. + vcfWriter.add(vc, ref.getBases()); + + return 1; + } + + /** + * Increment the number of rods processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of rods processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) {} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java index 141d03cfa..3e2b79cf4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java @@ -54,9 +54,14 @@ import java.util.*; public class ProduceBeagleInputWalker extends RodWalker { @Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = true) - public PrintStream beagleWriter = null; + public PrintStream beagleWriter = null; + @Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false) + public PrintStream beagleGenotypesWriter = null; + @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) + public double insertedNoCallRate = 0; final TreeSet samples = new TreeSet(); + Random generator; public void initialize() { @@ -71,11 +76,14 @@ public class ProduceBeagleInputWalker extends RodWalker { } } + generator = new Random(); beagleWriter.print("marker alleleA alleleB"); for ( String sample : samples ) beagleWriter.print(String.format(" %s %s %s", sample, sample, sample)); beagleWriter.println(); + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.println("dummy header"); } public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { @@ -90,6 +98,9 @@ public class ProduceBeagleInputWalker extends RodWalker { // output marker ID to Beagle input file beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString())); + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.print(String.format("%s ",vc_eval.getLocation().toString())); + for (Allele allele: vc_eval.getAlleles()) { // TODO -- check whether this is really needed by Beagle String bglPrintString; @@ -120,20 +131,39 @@ public class ProduceBeagleInputWalker extends RodWalker { Double dg = Double.valueOf(gl); if (dg> maxLikelihood) maxLikelihood = dg; - + likeArray.add(dg); - } + } - for (Double likeVal: likeArray) - beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood))); + // see if we need to randomly mask out genotype in this position. + Double d = generator.nextDouble(); + if (d > insertedNoCallRate ) { +// System.out.format("%5.4f ", d); + for (Double likeVal: likeArray) + beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood))); + } + else { + // we are masking out this genotype + beagleWriter.print("0.33 0.33 0.33 "); + } + + if (beagleGenotypesWriter != null) { + char a = genotype.getAllele(0).toString().charAt(0); + char b = genotype.getAllele(0).toString().charAt(0); + + beagleGenotypesWriter.format("%c %c ", a, b); + } } - else + else { beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes. - + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.print(". . "); + } } beagleWriter.println(); - + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.println(); } return 1; diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index fba1303ac..4ef2b27a1 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -1,26 +1,50 @@ package org.broadinstitute.sting.utils.genotype.vcf; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broad.tribble.vcf.VCFHeaderVersion; -import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.CalledGenotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.io.*; -import java.util.TreeSet; +import java.util.*; /** * this class writers VCF files */ public class VCFWriter { - + // the VCF header we're storing private VCFHeader mHeader = null; // the print stream we're writting to BufferedWriter mWriter; - private final String FIELD_SEPERATOR = "\t"; + + // our genotype sample fields + private static final List mGenotypeRecords = new ArrayList(); + + // commonly used strings that are in the standard + private final String FORMAT_FIELD_SEPARATOR = ":"; + private static final String GENOTYPE_FIELD_SEPARATOR = ":"; + private static final String FIELD_SEPARATOR = "\t"; + private static final String FILTER_CODE_SEPARATOR = ";"; + private static final String INFO_FIELD_SEPARATOR = ";"; + + // default values + private static final String UNFILTERED = "."; + private static final String PASSES_FILTERS = "0"; + private static final String PASSES_FILTERS_VCF_4_0 = "PASS"; + private static final String EMPTY_INFO_FIELD = "."; + private static final String EMPTY_ID_FIELD = "."; + private static final String EMPTY_ALLELE_FIELD = "."; + private static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; + private static final String MISSING_GENOTYPE_QUALITY = "."; /** * create a VCF writer, given a file to write to @@ -48,18 +72,32 @@ public class VCFWriter { } public void writeHeader(VCFHeader header) { + // use 3.3 format by default + // TODO - update to true once 4.0 is on by default + writeHeader(header, false); + } + public void writeHeader(VCFHeader header, boolean useVCF4Format) { this.mHeader = header; try { - // the fileformat field needs to be written first + // the file format field needs to be written first TreeSet nonFormatMetaData = new TreeSet(); for ( VCFHeaderLine line : header.getMetaData() ) { - if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) { - mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n"); + if (useVCF4Format) { + if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ) { + mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n"); + } else { + nonFormatMetaData.add(line); + } } - else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) { - mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n"); - } else { - nonFormatMetaData.add(line); + else { + if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) { + mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n"); + } + else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) { + mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n"); + } else { + nonFormatMetaData.add(line); + } } } @@ -70,10 +108,10 @@ public class VCFWriter { // write out the column line StringBuilder b = new StringBuilder(); b.append(VCFHeader.HEADER_INDICATOR); - for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPERATOR); + for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPARATOR); if (header.hasGenotypingData()) { - b.append("FORMAT" + FIELD_SEPERATOR); - for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPERATOR); + b.append("FORMAT" + FIELD_SEPARATOR); + for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPARATOR); } mWriter.write(b.toString() + "\n"); mWriter.flush(); // necessary so that writing to an output stream will work @@ -88,10 +126,27 @@ public class VCFWriter { * * @param record the record to output */ + public void add(VCFRecord record) { + addRecord(record); + } + public void addRecord(VCFRecord record) { addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT); } + public void add(VariantContext vc, byte[] refBases) { + if ( mHeader == null ) + throw new IllegalStateException("The VCF Header must be written before records can be added"); + + String vcfString = toStringEncoding(vc, mHeader, refBases); + try { + mWriter.write(vcfString + "\n"); + mWriter.flush(); // necessary so that writing to an output stream will work + } catch (IOException e) { + throw new RuntimeException("Unable to write the VCF object to a file"); + } + + } /** * output a record to the VCF file * @@ -124,4 +179,308 @@ public class VCFWriter { } } + private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) { + StringBuilder builder = new StringBuilder(); + + // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO + GenomeLoc loc = vc.getLocation(); + //String referenceBases = new String(vc.getReference().getBases()); + + String contig = loc.getContig(); + long position = loc.getStart(); + String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : EMPTY_ID_FIELD; + + builder.append(contig); + builder.append(FIELD_SEPARATOR); + builder.append(position); + builder.append(FIELD_SEPARATOR); + builder.append(ID); + builder.append(FIELD_SEPARATOR); + + // deal with the reference + String referenceBases = new String(vc.getReference().getBases()); + + double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1; + // TODO- clean up these flags and associated code + boolean filtersWereAppliedToContext = true; + List allowedGenotypeAttributeKeys = null; + boolean filtersWereAppliedToGenotypes = false; + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED); + + String referenceString = new String(vc.getReference().getBases()); + Map alleleMap = new HashMap(); + alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup + List vcfAltAlleles = new ArrayList(); + + boolean isComplexEvent = false; + if (referenceBases.length()>1) { + // complex event: by VCF 4.0, reference from previous position is included. + position--; + isComplexEvent = true; + } + for ( Allele a : vc.getAlleles() ) { + + VCFGenotypeEncoding encoding; + + + String alleleString = new String(a.getBases()); + if (isComplexEvent) {// vc.getType() == VariantContext.Type.MIXED ) { + // Mixed variants (e.g. microsatellites) have the reference before the event included + String s = new String(refBases)+alleleString; + encoding = new VCFGenotypeEncoding(s, true); + if (a.isReference()) + referenceString = s; + + + } else if ( vc.getType() == VariantContext.Type.INDEL ) { + if ( a.isNull() ) { + if ( a.isReference() ) { + // ref, where alt is insertion + encoding = new VCFGenotypeEncoding(new String(refBases)); + } else { + // non-ref deletion + encoding = new VCFGenotypeEncoding("."); + } + } else if ( a.isReference() ) { + // ref, where alt is deletion + encoding = new VCFGenotypeEncoding(new String(refBases)); + } else { + // non-ref insertion + referenceString = new String(refBases)+alleleString; + encoding = new VCFGenotypeEncoding(referenceString, true); + } + } else { + // no variation, ref or alt for snp + encoding = new VCFGenotypeEncoding(alleleString); + } + + if ( a.isNonReference() ) { + vcfAltAlleles.add(encoding); + } + + alleleMap.put(a, encoding); + } + + List vcfGenotypeAttributeKeys = new ArrayList(); + if ( vc.hasGenotypes() ) { + vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY); + for ( String key : calcVCFGenotypeKeys(vc) ) { + if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) ) + vcfGenotypeAttributeKeys.add(key); + } + if ( filtersWereAppliedToGenotypes ) + vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY); + } + String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); + + List genotypeObjects = new ArrayList(vc.getGenotypes().size()); + for ( Genotype g : vc.getGenotypesSortedByName() ) { + List encodings = new ArrayList(g.getPloidy()); + + for ( Allele a : g.getAlleles() ) { + encodings.add(alleleMap.get(a)); + } + + VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; + VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing); + + for ( String key : vcfGenotypeAttributeKeys ) { + if ( key.equals(VCFGenotypeRecord.GENOTYPE_KEY) ) + continue; + + Object val = g.getAttribute(key); + // some exceptions + if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) { + if ( MathUtils.compareDoubles(g.getNegLog10PError(), Genotype.NO_NEG_LOG_10PERROR) == 0 ) + val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY; + else + val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE); + } else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) { + ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY); + if ( pileup != null ) + val = pileup.size(); + } else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) { + val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS; + } + + String outputValue = formatVCFField(key, val); + if ( outputValue != null ) + vcfG.setField(key, outputValue); + } + + genotypeObjects.add(vcfG); + } + + mGenotypeRecords.clear(); + mGenotypeRecords.addAll(genotypeObjects); + // info fields + Map infoFields = new HashMap(); + for ( Map.Entry elt : vc.getAttributes().entrySet() ) { + String key = elt.getKey(); + if ( key.equals("ID") ) + continue; + + String outputValue = formatVCFField(key, elt.getValue()); + if ( outputValue != null ) + infoFields.put(key, outputValue); + } + + + + + builder.append(referenceString); + builder.append(FIELD_SEPARATOR); + + if ( vcfAltAlleles.size() > 0 ) { + builder.append(vcfAltAlleles.get(0)); + for ( int i = 1; i < vcfAltAlleles.size(); i++ ) { + builder.append(","); + builder.append(vcfAltAlleles.get(i)); + } + } else { + builder.append(EMPTY_ALLELE_FIELD); + } + builder.append(FIELD_SEPARATOR); + + + if ( qual == -1 ) + builder.append(MISSING_GENOTYPE_QUALITY); + else + builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, qual)); + + builder.append(FIELD_SEPARATOR); + + + builder.append(filters); + builder.append(FIELD_SEPARATOR); + builder.append(createInfoString(infoFields)); + + if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) { + addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles); + } + + return builder.toString(); + + + } + + /** + * add the genotype data + * + * @param builder the string builder + * @param header the header object + */ + private static void addGenotypeData(StringBuilder builder, VCFHeader header, + String genotypeFormatString, ListvcfAltAlleles) { + Map gMap = genotypeListToMap(mGenotypeRecords); + + StringBuffer tempStr = new StringBuffer(); + if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) { + for ( String sample : gMap.keySet() ) { + if ( !header.getGenotypeSamples().contains(sample) ) + System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header"); + else + header.getGenotypeSamples().remove(sample); + } + throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated"); + } + tempStr.append(FIELD_SEPARATOR + genotypeFormatString); + + String[] genotypeFormatStrings = genotypeFormatString.split(":"); + + for ( String genotype : header.getGenotypeSamples() ) { + tempStr.append(FIELD_SEPARATOR); + if ( gMap.containsKey(genotype) ) { + VCFGenotypeRecord rec = gMap.get(genotype); + tempStr.append(rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings)); + gMap.remove(genotype); + } else { + tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings)); + } + } + if ( gMap.size() != 0 ) { + for ( String sample : gMap.keySet() ) + System.err.println("Sample " + sample + " is being genotyped but isn't in the header."); + throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); + } + + builder.append(tempStr); + } + /** + * create a genotype mapping from a list and their sample names + * + * @param list a list of genotype samples + * @return a mapping of the sample name to VCF genotype record + */ + private static Map genotypeListToMap(List list) { + Map map = new HashMap(); + for (int i = 0; i < list.size(); i++) { + VCFGenotypeRecord rec = list.get(i); + map.put(rec.getSampleName(), rec); + } + return map; + } + + /** + * create the info string + * + * @return a string representing the infomation fields + */ + static protected String createInfoString(Map infoFields) { + StringBuffer info = new StringBuffer(); + boolean isFirst = true; + for (Map.Entry entry : infoFields.entrySet()) { + if ( isFirst ) + isFirst = false; + else + info.append(INFO_FIELD_SEPARATOR); + info.append(entry.getKey()); + if ( entry.getValue() != null && !entry.getValue().equals("") ) { + info.append("="); + info.append(entry.getValue()); + } + } + return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString(); + } + + private static String formatVCFField(String key, Object val) { + String result; + if ( val == null ) + result = VCFGenotypeRecord.getMissingFieldValue(key); + else if ( val instanceof Double ) + result = String.format("%.2f", (Double)val); + else if ( val instanceof Boolean ) + result = (Boolean)val ? "" : null; // empty string for true, null for false + else if ( val instanceof List ) { + List list = (List)val; + if ( list.size() == 0 ) + return formatVCFField(key, null); + StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0))); + for ( int i = 1; i < list.size(); i++) { + sb.append(","); + sb.append(formatVCFField(key, list.get(i))); + } + result = sb.toString(); + } else + result = val.toString(); + + return result; + } + + private static List calcVCFGenotypeKeys(VariantContext vc) { + Set keys = new HashSet(); + + boolean sawGoodQual = false; + for ( Genotype g : vc.getGenotypes().values() ) { + keys.addAll(g.getAttributes().keySet()); + if ( g.hasNegLog10PError() ) + sawGoodQual = true; + } + + if ( sawGoodQual ) + keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); + return Utils.sorted(new ArrayList(keys)); + } + + }