1) Initial complete version of VCF4 writer. There are still issues (see below) but at least this version is fully functional. It incorporates getting rid of intermediate VCFRecord so we now operate from VariantContext objects directly to VCF 4.0 output.

See VCF4WriterTestWalker for usage example: it just amounts to adding
vcfWriter.add(vc,ref.getBases()) in walker.

add() method in VCFWriter is polymorphic and can also take a VCFRecord, although eventually this should become obsolete.
addRecord is still supported so all backward compatibility is maintained.

The resulting VCF 4.0 files are still not perfect, so additional changes are in progress. Specifically:
a) INFO codes of length 0 (e.g. HM, DB) are not emitted correctly (they should emit just "HM" but now they emit "HM=1").
b) Genotype values that are specified as Integer in header are ignored in type and are printed out as Doubles.

Both issues should be corrected with better header parsing.

2) Check in ability of Beagle to mask an additional percentage of genotype likelihoods (0 by default), for testing purposes.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3664 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-06-28 23:54:38 +00:00
parent 4a451949ba
commit ed71e53dd4
4 changed files with 574 additions and 28 deletions

View File

@ -24,7 +24,10 @@ public class VCFGenotypeEncoding {
private final TYPE mType;
// public constructor, that parses out the base string
public VCFGenotypeEncoding(String baseString) {
public VCFGenotypeEncoding(String baseString){
this(baseString, false);
}
public VCFGenotypeEncoding(String baseString, boolean allowMultipleBaseReference) {
if ((baseString.length() == 1)) {
// are we an empty (no-call) genotype?
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
@ -39,19 +42,23 @@ public class VCFGenotypeEncoding {
mType = TYPE.SINGLE_BASE;
}
} else { // deletion or insertion
if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) {
if (baseString.length() < 1 ||(!allowMultipleBaseReference && (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I'))) {
throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
}
if (baseString.toUpperCase().charAt(0) == 'D') {
mLength = Integer.valueOf(baseString.substring(1, baseString.length()));
mBases = "";
mType = TYPE.DELETION;
} else { // we're an I
} else if (baseString.toUpperCase().charAt(0) == 'I') { // we're an I
mBases = baseString.substring(1, baseString.length()).toUpperCase();
if (!validBases(mBases))
throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
mLength = mBases.length();
mType = TYPE.INSERTION;
} else{
mBases = baseString;
mType = TYPE.MIXED;
mLength = mBases.length();
}
}
}
@ -97,6 +104,7 @@ public class VCFGenotypeEncoding {
switch (mType) {
case SINGLE_BASE:
case UNCALLED:
case MIXED:
builder.append(mBases);
break;
case INSERTION:

View File

@ -0,0 +1,149 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.util.Iterator;
/**
* Test walker for the VCF 4.0 writer: reads records from the "vcf" input ROD,
* converts them to VariantContext objects, and re-emits them through VCFWriter.add().
*/
public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
    private VCFWriter vcfWriter;

    @Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
    private String OUTPUT_FILE = null;

    // the name the input VCF track must be bound to on the command line
    public static final String INPUT_ROD_NAME = "vcf";

    // NOTE(review): appears unused in this walker; kept for interface compatibility
    protected static String line = null;

    final TreeSet<String> samples = new TreeSet<String>();

    VCF4Codec vcf4codec = new VCF4Codec();

    /**
     * Initialize the number of loci processed to zero.
     *
     * @return 0
     */
    public Integer reduceInit() { return 0; }

    /**
     * Opens the output VCF file and copies the header parsed from the
     * input VCF 4.0 track into the writer.
     */
    public void initialize() {
        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
        // Open output file specified by output VCF ROD
        vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
        for ( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            // only pull the header from the VCF4-typed track bound to INPUT_ROD_NAME
            if ( rod.getType().equals(vcf4codec.getClass()) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME) ) {
                try {
                    AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
                    int lineNumber = vcf4codec.readHeader(lineReader);
                    out.printf("Read %d header lines%n", lineNumber);
                    VCFHeader header = vcf4codec.getHeader(VCFHeader.class);
                    vcfWriter.writeHeader(header);
                }
                catch (FileNotFoundException e) {
                    throw new StingException(e.getMessage());
                }
            }
        }
    }

    /**
     * Fetches the VariantContext at this locus from the input track and
     * writes it straight back out in VCF 4.0 format.
     *
     * @param tracker the meta-data tracker
     * @param ref the reference base
     * @param context the context for the given locus
     * @return 1 if the locus was successfully processed, 0 if otherwise
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null )
            return 0;

        GenomeLoc loc = context.getLocation();
        VariantContext vc = tracker.getVariantContext(ref, INPUT_ROD_NAME, null, loc, true);
        if (vc == null)
            return 0;

        // Write variant context directly to VCF 4.0 format.
        vcfWriter.add(vc, ref.getBases());
        return 1;
    }

    /**
     * Increment the number of rods processed.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return the new number of rods processed.
     */
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    public void onTraversalDone(Integer result) {}
}

View File

@ -54,9 +54,14 @@ import java.util.*;
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = true)
public PrintStream beagleWriter = null;
public PrintStream beagleWriter = null;
@Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false)
public PrintStream beagleGenotypesWriter = null;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
final TreeSet<String> samples = new TreeSet<String>();
Random generator;
public void initialize() {
@ -71,11 +76,14 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
}
}
generator = new Random();
beagleWriter.print("marker alleleA alleleB");
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println("dummy header");
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
@ -90,6 +98,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
// output marker ID to Beagle input file
beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
for (Allele allele: vc_eval.getAlleles()) {
// TODO -- check whether this is really needed by Beagle
String bglPrintString;
@ -120,20 +131,39 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
Double dg = Double.valueOf(gl);
if (dg> maxLikelihood)
maxLikelihood = dg;
likeArray.add(dg);
}
}
for (Double likeVal: likeArray)
beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood)));
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: likeArray)
beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood)));
}
else {
// we are masking out this genotype
beagleWriter.print("0.33 0.33 0.33 ");
}
if (beagleGenotypesWriter != null) {
    // emit both alleles of the called genotype for error analysis
    char a = genotype.getAllele(0).toString().charAt(0);
    // BUG FIX: second character must come from allele 1, not a copy of allele 0
    char b = genotype.getAllele(1).toString().charAt(0);
    beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else
else {
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
}
}
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println();
}
return 1;

View File

@ -1,26 +1,50 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFHeaderVersion;
import org.broad.tribble.vcf.VCFRecord;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.*;
import java.util.TreeSet;
import java.util.*;
/**
* this class writes VCF files
*/
public class VCFWriter {
// the VCF header we're storing
private VCFHeader mHeader = null;
// the buffered writer we're writing to
BufferedWriter mWriter;
private final String FIELD_SEPERATOR = "\t";
// our genotype sample fields
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
// commonly used strings that are in the standard
private final String FORMAT_FIELD_SEPARATOR = ":";
private static final String GENOTYPE_FIELD_SEPARATOR = ":";
private static final String FIELD_SEPARATOR = "\t";
private static final String FILTER_CODE_SEPARATOR = ";";
private static final String INFO_FIELD_SEPARATOR = ";";
// default values
private static final String UNFILTERED = ".";
private static final String PASSES_FILTERS = "0";
private static final String PASSES_FILTERS_VCF_4_0 = "PASS";
private static final String EMPTY_INFO_FIELD = ".";
private static final String EMPTY_ID_FIELD = ".";
private static final String EMPTY_ALLELE_FIELD = ".";
private static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
private static final String MISSING_GENOTYPE_QUALITY = ".";
/**
* create a VCF writer, given a file to write to
@ -48,18 +72,32 @@ public class VCFWriter {
}
public void writeHeader(VCFHeader header) {
// use 3.3 format by default
// TODO - update to true once 4.0 is on by default
writeHeader(header, false);
}
public void writeHeader(VCFHeader header, boolean useVCF4Format) {
this.mHeader = header;
try {
// the fileformat field needs to be written first
// the file format field needs to be written first
TreeSet<VCFHeaderLine> nonFormatMetaData = new TreeSet<VCFHeaderLine>();
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
if (useVCF4Format) {
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n");
} else {
nonFormatMetaData.add(line);
}
}
else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n");
} else {
nonFormatMetaData.add(line);
else {
if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
}
else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n");
} else {
nonFormatMetaData.add(line);
}
}
}
@ -70,10 +108,10 @@ public class VCFWriter {
// write out the column line
StringBuilder b = new StringBuilder();
b.append(VCFHeader.HEADER_INDICATOR);
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPERATOR);
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPARATOR);
if (header.hasGenotypingData()) {
b.append("FORMAT" + FIELD_SEPERATOR);
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPERATOR);
b.append("FORMAT" + FIELD_SEPARATOR);
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPARATOR);
}
mWriter.write(b.toString() + "\n");
mWriter.flush(); // necessary so that writing to an output stream will work
@ -88,10 +126,27 @@ public class VCFWriter {
*
* @param record the record to output
*/
/** Polymorphic add: delegates legacy VCFRecord objects to addRecord. */
public void add(VCFRecord record) {
addRecord(record);
}

/** Writes a legacy VCFRecord using STRICT validation (kept for backward compatibility). */
public void addRecord(VCFRecord record) {
addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT);
}
/**
 * Encodes a VariantContext as a VCF 4.0 data line and writes it to the output.
 *
 * @param vc       the variant context to write
 * @param refBases reference bases at (and around) the event, used to encode indel/complex alleles
 * @throws IllegalStateException if writeHeader() has not been called yet
 * @throws RuntimeException      if the underlying writer fails (cause attached)
 */
public void add(VariantContext vc, byte[] refBases) {
    if ( mHeader == null )
        throw new IllegalStateException("The VCF Header must be written before records can be added");

    String vcfString = toStringEncoding(vc, mHeader, refBases);
    try {
        mWriter.write(vcfString + "\n");
        mWriter.flush(); // necessary so that writing to an output stream will work
    } catch (IOException e) {
        // preserve the underlying cause instead of swallowing it
        throw new RuntimeException("Unable to write the VCF object to a file", e);
    }
}
/**
* output a record to the VCF file
*
@ -124,4 +179,308 @@ public class VCFWriter {
}
}
/**
 * Encodes a single VariantContext as one VCF 4.0 data line:
 * CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, and (when genotypes are
 * present) the FORMAT column plus one genotype column per sample.
 *
 * NOTE(review): genotype records are staged in the static mGenotypeRecords
 * list before addGenotypeData() consumes them, so this method is not
 * thread-safe across VCFWriter instances — confirm single-threaded use.
 *
 * @param vc       the variant context to encode
 * @param header   the VCF header (defines sample ordering for genotype columns)
 * @param refBases reference bases used to build indel/complex allele encodings
 * @return the tab-separated VCF 4.0 line (no trailing newline)
 */
private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) {
StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
GenomeLoc loc = vc.getLocation();
//String referenceBases = new String(vc.getReference().getBases());
String contig = loc.getContig();
long position = loc.getStart();
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : EMPTY_ID_FIELD;
builder.append(contig);
builder.append(FIELD_SEPARATOR);
builder.append(position);
builder.append(FIELD_SEPARATOR);
builder.append(ID);
builder.append(FIELD_SEPARATOR);
// deal with the reference
String referenceBases = new String(vc.getReference().getBases());
double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1;
// TODO- clean up these flags and associated code
boolean filtersWereAppliedToContext = true;
List<String> allowedGenotypeAttributeKeys = null;
boolean filtersWereAppliedToGenotypes = false;
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
String referenceString = new String(vc.getReference().getBases());
// map from each context allele to its VCF genotype encoding; NO_CALL maps to "."
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
boolean isComplexEvent = false;
if (referenceBases.length()>1) {
// complex event: by VCF 4.0, reference from previous position is included.
// NOTE(review): POS was already appended to the builder above, so this
// decrement never reaches the output line — verify whether the emitted
// POS should be the padded (position - 1) coordinate.
position--;
isComplexEvent = true;
}
// build a VCFGenotypeEncoding for every allele (ref + alts), padding
// indel/complex alleles with the preceding reference base per VCF 4.0
for ( Allele a : vc.getAlleles() ) {
VCFGenotypeEncoding encoding;
String alleleString = new String(a.getBases());
if (isComplexEvent) {// vc.getType() == VariantContext.Type.MIXED ) {
// Mixed variants (e.g. microsatellites) have the reference before the event included
String s = new String(refBases)+alleleString;
encoding = new VCFGenotypeEncoding(s, true);
if (a.isReference())
referenceString = s;
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
if ( a.isNull() ) {
if ( a.isReference() ) {
// ref, where alt is insertion
encoding = new VCFGenotypeEncoding(new String(refBases));
} else {
// non-ref deletion
encoding = new VCFGenotypeEncoding(".");
}
} else if ( a.isReference() ) {
// ref, where alt is deletion
encoding = new VCFGenotypeEncoding(new String(refBases));
} else {
// non-ref insertion
referenceString = new String(refBases)+alleleString;
encoding = new VCFGenotypeEncoding(referenceString, true);
}
} else {
// no variation, ref or alt for snp
encoding = new VCFGenotypeEncoding(alleleString);
}
if ( a.isNonReference() ) {
vcfAltAlleles.add(encoding);
}
alleleMap.put(a, encoding);
}
// decide which FORMAT keys to emit: GT first, then the union of
// per-sample attribute keys, then FT if genotype filters were applied
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
vcfGenotypeAttributeKeys.add(key);
}
if ( filtersWereAppliedToGenotypes )
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
}
String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
// convert each Genotype to a VCFGenotypeRecord, applying per-key
// special cases (GQ quality clamping, DP from pileup, FT filters)
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a));
}
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing);
for ( String key : vcfGenotypeAttributeKeys ) {
if ( key.equals(VCFGenotypeRecord.GENOTYPE_KEY) )
continue;
Object val = g.getAttribute(key);
// some exceptions
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
if ( MathUtils.compareDoubles(g.getNegLog10PError(), Genotype.NO_NEG_LOG_10PERROR) == 0 )
val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY;
else
val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE);
} else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) {
// fall back to the pileup size when no explicit DP was stored
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
if ( pileup != null )
val = pileup.size();
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS;
}
String outputValue = formatVCFField(key, val);
if ( outputValue != null )
vcfG.setField(key, outputValue);
}
genotypeObjects.add(vcfG);
}
// stage genotype records for addGenotypeData() (static shared state — see note above)
mGenotypeRecords.clear();
mGenotypeRecords.addAll(genotypeObjects);
// info fields
Map<String, String> infoFields = new HashMap<String, String>();
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
String key = elt.getKey();
if ( key.equals("ID") )
continue;
String outputValue = formatVCFField(key, elt.getValue());
if ( outputValue != null )
infoFields.put(key, outputValue);
}
// REF column, then comma-joined ALT alleles (or "." when there are none)
builder.append(referenceString);
builder.append(FIELD_SEPARATOR);
if ( vcfAltAlleles.size() > 0 ) {
builder.append(vcfAltAlleles.get(0));
for ( int i = 1; i < vcfAltAlleles.size(); i++ ) {
builder.append(",");
builder.append(vcfAltAlleles.get(i));
}
} else {
builder.append(EMPTY_ALLELE_FIELD);
}
builder.append(FIELD_SEPARATOR);
// QUAL: "." when no error estimate was available (qual sentinel -1)
if ( qual == -1 )
builder.append(MISSING_GENOTYPE_QUALITY);
else
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, qual));
builder.append(FIELD_SEPARATOR);
builder.append(filters);
builder.append(FIELD_SEPARATOR);
builder.append(createInfoString(infoFields));
if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) {
addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles);
}
return builder.toString();
}
/**
 * Appends the FORMAT column and one genotype column per header sample to the
 * builder, consuming the records staged in the static mGenotypeRecords list.
 * Samples present in the header but without a staged record get an "empty"
 * genotype encoding; a mismatch in either direction raises IllegalStateException.
 *
 * @param builder              the string builder receiving the output line
 * @param header               the header object (defines sample order)
 * @param genotypeFormatString colon-separated FORMAT keys (e.g. "GT:GQ:DP")
 * @param vcfAltAlleles        alternate-allele encodings used to index GT values
 */
private static void addGenotypeData(StringBuilder builder, VCFHeader header,
String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords);

StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) {
// more records than header samples: report which records are duplicates
// vs. simply missing from the header, then fail.
// NOTE(review): this loop mutates header.getGenotypeSamples() before
// throwing — confirm the header is not reused after this exception.
for ( String sample : gMap.keySet() ) {
if ( !header.getGenotypeSamples().contains(sample) )
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
else
header.getGenotypeSamples().remove(sample);
}
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
}
tempStr.append(FIELD_SEPARATOR + genotypeFormatString);

String[] genotypeFormatStrings = genotypeFormatString.split(":");

// emit one column per header sample, in header order
for ( String genotype : header.getGenotypeSamples() ) {
tempStr.append(FIELD_SEPARATOR);
if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype);
tempStr.append(rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings));
gMap.remove(genotype);
} else {
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings));
}
}
// any leftover records belong to samples the header doesn't know about
if ( gMap.size() != 0 ) {
for ( String sample : gMap.keySet() )
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
}

builder.append(tempStr);
}
/**
 * Builds a lookup of VCF genotype records keyed by sample name.
 *
 * @param list the genotype records to index
 * @return map from sample name to its VCFGenotypeRecord
 */
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
    Map<String, VCFGenotypeRecord> bySample = new HashMap<String, VCFGenotypeRecord>();
    for ( VCFGenotypeRecord record : list )
        bySample.put(record.getSampleName(), record);
    return bySample;
}
/**
 * Renders the INFO column from key/value pairs. Flag fields (null or empty
 * values) are emitted as bare keys; an empty map becomes ".".
 *
 * @param infoFields the INFO key/value pairs to render
 * @return the semicolon-separated INFO string
 */
static protected String createInfoString(Map<String,String> infoFields) {
    StringBuffer info = new StringBuffer();
    boolean first = true;
    for (Map.Entry<String, String> field : infoFields.entrySet()) {
        if ( !first )
            info.append(INFO_FIELD_SEPARATOR);
        first = false;
        info.append(field.getKey());
        String value = field.getValue();
        // only key=value when there is an actual value; flags stay bare
        if ( value != null && !value.equals("") ) {
            info.append("=");
            info.append(value);
        }
    }
    return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString();
}
/**
 * Formats a single INFO or genotype field value for VCF output:
 * null -> the key's missing-value string, Double -> fixed 2-decimal text,
 * Boolean -> "" (flag present) or null (omit), List -> comma-joined
 * recursively-formatted elements, anything else -> toString().
 *
 * @param key the field key (used to look up missing-value defaults)
 * @param val the raw value, possibly null
 * @return the formatted string, or null when the field should be omitted
 */
private static String formatVCFField(String key, Object val) {
    String result;
    if ( val == null )
        result = VCFGenotypeRecord.getMissingFieldValue(key);
    else if ( val instanceof Double )
        // Locale.US pins '.' as the decimal separator; the platform default
        // locale could emit ',' and corrupt the VCF
        result = String.format(Locale.US, "%.2f", (Double)val);
    else if ( val instanceof Boolean )
        result = (Boolean)val ? "" : null; // empty string for true, null for false
    else if ( val instanceof List ) {
        List<?> list = (List<?>)val;
        if ( list.isEmpty() )
            return formatVCFField(key, null);
        StringBuilder sb = new StringBuilder(formatVCFField(key, list.get(0)));
        for ( int i = 1; i < list.size(); i++ ) {
            sb.append(",");
            sb.append(formatVCFField(key, list.get(i)));
        }
        result = sb.toString();
    } else
        result = val.toString();
    return result;
}
/**
 * Collects the union of all genotype attribute keys across the samples in the
 * context, adding the genotype-quality key (GQ) when at least one genotype
 * carries a real (non-missing) quality.
 *
 * @param vc the variant context whose genotypes are scanned
 * @return sorted list of genotype FORMAT keys to emit
 */
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();

// true once any genotype reports a usable -log10(P(error))
boolean sawGoodQual = false;
for ( Genotype g : vc.getGenotypes().values() ) {
keys.addAll(g.getAttributes().keySet());
if ( g.hasNegLog10PError() )
sawGoodQual = true;
}

if ( sawGoodQual )
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
return Utils.sorted(new ArrayList<String>(keys));
}
}