Refactoring the VCF writer code; now no longer uses VCFRecord or any of its related classes, instead writing directly to the writer. Integration tests pass, but some are actually broken and will be fixed this week.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3822 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-07-18 13:19:56 +00:00
parent 379584f1bf
commit 693672a461
9 changed files with 278 additions and 413 deletions

View File

@ -58,6 +58,9 @@ public final class VCFConstants {
public static final String FIELD_SEPARATOR = "\t";
public static final String FILTER_CODE_SEPARATOR = ";";
public static final String INFO_FIELD_SEPARATOR = ";";
public static final String UNPHASED = "/";
public static final String PHASED = "|";
public static final String PHASED_SWITCH_PROB_v3 = "\\";
// missing/default values
public static final String UNFILTERED = ".";

View File

@ -20,6 +20,8 @@ public class VCFHeader {
// the associated meta data
private final Set<VCFHeaderLine> mMetaData;
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
// the list of auxiliary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
@ -41,6 +43,7 @@ public class VCFHeader {
public VCFHeader(Set<VCFHeaderLine> metaData) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
loadVCFVersion();
loadMetaDataMaps();
}
/**
@ -57,6 +60,7 @@ public class VCFHeader {
}
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
loadVCFVersion();
loadMetaDataMaps();
}
/**
@ -74,6 +78,18 @@ public class VCFHeader {
}
/**
* load the format/info meta data maps (these are used for quick lookup by key name)
*/
private void loadMetaDataMaps() {
    // Index INFO and FORMAT header lines by key so lookups by name are O(1).
    for ( VCFHeaderLine headerLine : mMetaData ) {
        if ( headerLine instanceof VCFInfoHeaderLine ) {
            mInfoMetaData.put(headerLine.getKey(), (VCFInfoHeaderLine) headerLine);
        } else if ( headerLine instanceof VCFFormatHeaderLine ) {
            mFormatMetaData.put(headerLine.getKey(), (VCFFormatHeaderLine) headerLine);
        }
    }
}
/**
* get the header fields in order they're presented in the input file (which is now required to be
* the order presented in the spec).
@ -117,11 +133,26 @@ public class VCFHeader {
return hasGenotypingData;
}
/** @return the column count, */
/** @return the column count */
public int getColumnCount() {
    // Fixed header columns, plus the FORMAT column and one column per
    // genotype sample when genotyping data is present.
    int genotypeColumns = hasGenotypingData ? mGenotypeSampleNames.size() + 1 : 0;
    return HEADER_FIELDS.values().length + genotypeColumns;
}
/**
* Look up the INFO meta-data header line registered under the given key.
*
* @param key the header key name
* @return the meta data line, or null if there is none
*/
public VCFInfoHeaderLine getInfoHeaderLine(String key) {
return mInfoMetaData.get(key);
}
/**
* Look up the FORMAT meta-data header line registered under the given key.
*
* @param key the header key name
* @return the meta data line, or null if there is none
*/
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
return mFormatMetaData.get(key);
}
}

View File

@ -51,14 +51,6 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage<VCFGenotypeW
((VCFGenotypeWriter)writer).addRecord(vcfRecord);
}
/**
* set the validation stringency
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value) {
((VCFGenotypeWriter)writer).setValidationStringency(value);
}
/**
* Merges the stream backing up this temporary storage into the target.
* @param target Target stream for the temporary storage. May not be null.

View File

@ -60,12 +60,4 @@ public class VCFGenotypeWriterStub extends GenotypeWriterStub<VCFGenotypeWriter>
public void addRecord(VCFRecord vcfRecord) {
outputTracker.getStorage(this).addRecord(vcfRecord);
}
/**
* set the validation stringency
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value) {
outputTracker.getStorage(this).setValidationStringency(value);
}
}

View File

@ -150,8 +150,10 @@ public class SequenomValidationConverter extends RodWalker<Pair<VariantContext,
int goodRecords = numRecords - numHWViolations - numNoCallViolations - numHomVarViolations;
System.out.println(String.format("Number of records passing all filters:\t\t\t%d (%d%%)", goodRecords, 100*goodRecords/numRecords));
hInfo.add(new VCFHeaderLine("ValidationMetrics_RecordsPassingFilters", String.format("\"%d (%d%%)\"", goodRecords, 100*goodRecords/numRecords)));
System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords));
hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords)));
if ( goodRecords > 0 ) {
System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords));
hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords)));
}
}
VCFHeader header = new VCFHeader(hInfo, sampleNames);

View File

@ -27,12 +27,4 @@ public interface VCFGenotypeWriter extends GenotypeWriter {
* @param vcfRecord Record to add.
*/
public void addRecord(VCFRecord vcfRecord);
/**
* set the validation stringency
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value);
public enum VALIDATION_STRINGENCY { STRICT, SILENT };
}

View File

@ -27,9 +27,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
// our log, which we want to capture anything from this class
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
// validation stringency
private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT;
// allowed genotype format strings
private Set<String> allowedGenotypeFormatStrings = null;
@ -43,6 +40,10 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
mWriter = new VCFWriter(writeTo);
}
public void addRecord(VCFRecord vcfRecord) {
mWriter.addRecord(vcfRecord);
}
/**
* initialize this VCF header
*
@ -92,16 +93,4 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
mWriter.add(vc, new byte[]{(byte)refAllele.charAt(0)});
}
public void addRecord(VCFRecord vcfRecord) {
mWriter.addRecord(vcfRecord, validationStringency);
}
/**
* set the validation stringency
* @param value validation stringency value
*/
public void setValidationStringency(VALIDATION_STRINGENCY value) {
validationStringency = value;
}
}

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -19,7 +20,6 @@ import java.util.*;
*/
public class VCFWriter {
// the VCF header we're storing
private VCFHeader mHeader = null;
@ -29,15 +29,6 @@ public class VCFWriter {
// were filters applied?
private boolean filtersWereAppliedToContext = false;
// our genotype sample fields
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
// Properties only used when using VCF4.0 encoding
Map<String, VCFHeaderLineType> typeUsedForFormatString = new HashMap<String, VCFHeaderLineType>();
Map<String, VCFHeaderLineType> typeUsedForInfoFields = new HashMap<String, VCFHeaderLineType>();
Map<String, Integer> numberUsedForInfoFields = new HashMap<String, Integer>();
Map<String, Integer> numberUsedForFormatFields = new HashMap<String, Integer>();
/**
* create a VCF writer, given a file to write to
*
@ -72,43 +63,36 @@ public class VCFWriter {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n");
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ||
line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ||
line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) )
continue;
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ||
line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ||
line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) )
continue;
// Record, if line corresponds to a FORMAT field, which type will be used for writing value
if (line.getClass() == VCFFormatHeaderLine.class) {
VCFFormatHeaderLine a = (VCFFormatHeaderLine)line;
String key = a.getName();
typeUsedForFormatString.put(key,a.getType());
int num = a.getCount();
numberUsedForFormatFields.put(key,num);
} else if (line.getClass() == VCFInfoHeaderLine.class) {
VCFInfoHeaderLine a = (VCFInfoHeaderLine)line;
String key = a.getName();
typeUsedForInfoFields.put(key,a.getType());
int num = a.getCount();
numberUsedForInfoFields.put(key, num);
} else if (line.getClass() == VCFFilterHeaderLine.class) {
filtersWereAppliedToContext = true;
}
// are the records filtered (so we know what to put in the FILTER column of passing records) ?
if ( line instanceof VCFFilterHeaderLine )
filtersWereAppliedToContext = true;
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
mWriter.write(VCFHeader.METADATA_INDICATOR);
mWriter.write(line.toString());
mWriter.write("\n");
}
// write out the column line
StringBuilder b = new StringBuilder();
b.append(VCFHeader.HEADER_INDICATOR);
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields())
b.append(field + VCFConstants.FIELD_SEPARATOR);
if (header.hasGenotypingData()) {
b.append("FORMAT" + VCFConstants.FIELD_SEPARATOR);
for (String field : header.getGenotypeSamples())
b.append(field + VCFConstants.FIELD_SEPARATOR);
mWriter.write(VCFHeader.HEADER_INDICATOR);
for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
mWriter.write(field.toString());
mWriter.write(VCFConstants.FIELD_SEPARATOR);
}
mWriter.write(b.toString() + "\n");
if ( header.hasGenotypingData() ) {
mWriter.write("FORMAT");
mWriter.write(VCFConstants.FIELD_SEPARATOR);
for ( String sample : header.getGenotypeSamples() ) {
mWriter.write(sample);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
} }
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
}
catch (IOException e) {
@ -116,39 +100,13 @@ public class VCFWriter {
}
}
/**
* output a record to the VCF file
*
* @param record the record to output
*/
public void add(VCFRecord record) {
addRecord(record);
}
public void addRecord(VCFRecord record) {
addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT);
}
public void add(VariantContext vc, byte[] refBases) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added");
String vcfString = toStringEncoding(vc, mHeader, refBases);
try {
mWriter.write(vcfString + "\n");
mWriter.flush(); // necessary so that writing to an output stream will work
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to a file");
}
}
/**
* output a record to the VCF file
*
* @param record the record to output
* @param validationStringency the validation stringency
*/
public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) {
@Deprecated
public void addRecord(VCFRecord record) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added");
@ -174,104 +132,188 @@ public class VCFWriter {
}
}
private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) {
StringBuilder builder = new StringBuilder();
public void add(VariantContext vc, byte[] refBases) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added");
if ( refBases == null || refBases.length < 1 )
throw new IllegalArgumentException("The reference base must be provided to write VCF records");
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
GenomeLoc loc = vc.getLocation();
try {
GenomeLoc loc = vc.getLocation();
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
String contig = loc.getContig();
long position = loc.getStart();
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD;
// CHROM
mWriter.write(loc.getContig());
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// POS
// TODO -- Remove this off-by-one issue when positions are settled in the input
mWriter.write(String.valueOf(loc.getStart() - (vc.isIndel() ? 1 : 0)));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// deal with the reference
String referenceFromVC = new String(vc.getReference().getBases());
// ID
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD;
mWriter.write(ID);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1;
// TODO- clean up these flags and associated code
// REF
alleleMap.put(vc.getReference(), "0");
String refString = makeAlleleString(vc.getReference(), vc.isIndel(), refBases[0]);
mWriter.write(refString);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
// ALT
if ( vc.getAlternateAlleles().size() > 0 ) {
Allele altAllele = vc.getAlternateAllele(0);
alleleMap.put(altAllele, "1");
String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]);
mWriter.write(alt);
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFConstants.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
int numTrailingBases = 0, numPaddingBases = 0;
String paddingBases = "";
String trailingBases = "";
// search for reference allele and find trailing and padding at the end.
// See first if all alleles have a base encoding (ie no deletions)
// if so, add one common base to all alleles (reference at this location)
boolean hasBasesInAllAlleles = true;
String refString = null;
for ( Allele a : vc.getAlleles() ) {
if (a.isNull() || a.isNoCall())
hasBasesInAllAlleles = false;
if (a.isReference())
refString = new String(a.getBases());
}
// Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
if (!hasBasesInAllAlleles) {
trailingBases = new String(refBases);
numTrailingBases = 1;
position--;
}
for ( Allele a : vc.getAlleles() ) {
VCFGenotypeEncoding encoding;
String alleleString = new String(a.getBases());
String s = trailingBases+alleleString+paddingBases;
encoding = new VCFGenotypeEncoding(s, true);
// overwrite reference string by possibly padded version
if (a.isReference())
referenceFromVC = s;
else {
vcfAltAlleles.add(encoding);
for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
altAllele = vc.getAlternateAllele(i);
alleleMap.put(altAllele, String.valueOf(i+1));
alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]);
mWriter.write(",");
mWriter.write(alt);
}
} else {
mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
}
mWriter.write(VCFConstants.FIELD_SEPARATOR);
alleleMap.put(a, encoding);
}
// QUAL
if ( !vc.hasNegLog10PError() )
mWriter.write(VCFConstants.MISSING_VALUE_v4);
else
mWriter.write(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, vc.getPhredScaledQual()));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
vcfGenotypeAttributeKeys.add(key);
}
} else if ( header.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
}
// FILTER
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
mWriter.write(filters);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a));
}
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing);
for ( String key : vcfGenotypeAttributeKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
// INFO
Map<String, String> infoFields = new TreeMap<String, String>();
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
String key = field.getKey();
if ( key.equals("ID") )
continue;
String outputValue = formatVCFField(field.getValue());
if ( outputValue != null )
infoFields.put(key, outputValue);
}
writeInfoString(infoFields);
// FORMAT
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
genotypeAttributeKeys.add(key);
}
} else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
}
if ( genotypeAttributeKeys.size() > 0 ) {
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(genotypeFormatString);
addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
}
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to a file");
}
}
private String makeAlleleString(Allele allele, boolean isIndel, byte ref) {
    // Indel alleles — and empty alleles at a monomorphic indel site —
    // must be prefixed with the preceding reference base for VCF output.
    String bases = new String(allele.getBases());
    boolean needsRefPrefix = isIndel || bases.length() == 0;
    return needsRefPrefix ? (char) ref + bases : bases;
}
/**
* create the info string; assumes that no values are null
*
* @param infoFields a map of info fields
* @throws IOException for writer
*/
/**
 * Write the INFO column for the current record; assumes no values are null.
 *
 * @param infoFields a map of info fields (key -> already-formatted value)
 * @throws IOException if the underlying writer fails
 */
protected void writeInfoString(Map<String, String> infoFields) throws IOException {
    // An empty INFO column is written as the missing-value placeholder.
    if ( infoFields.isEmpty() ) {
        mWriter.write(VCFConstants.EMPTY_INFO_FIELD);
        return;
    }
    String separator = "";
    for ( Map.Entry<String, String> field : infoFields.entrySet() ) {
        mWriter.write(separator);
        separator = VCFConstants.INFO_FIELD_SEPARATOR;
        String key = field.getKey();
        mWriter.write(key);
        String value = field.getValue();
        if ( value.equals("") )
            continue; // flag-style field: key only, no '=value' part
        // Consult the header to learn how many values this key carries;
        // a declared count of zero marks a flag and suppresses '=value'.
        int expectedCount = 1;
        VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key);
        if ( metaData != null )
            expectedCount = metaData.getCount();
        // unbounded ('.') counts still write the value as-is
        if ( expectedCount == VCFInfoHeaderLine.UNBOUNDED )
            expectedCount = 1;
        if ( expectedCount > 0 ) {
            mWriter.write("=");
            mWriter.write(value);
        }
    }
}
/**
* add the genotype data
*
* @param vc the variant context
* @param genotypeFormatKeys Genotype formatting string
* @param alleleMap alleles for this context
* @throws IOException for writer
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
Genotype g = vc.getGenotype(sample);
if ( g == null ) {
// TODO -- The VariantContext needs to know what the general ploidy is of the samples
// TODO -- We shouldn't be assuming diploid genotypes here!
mWriter.write(VCFConstants.EMPTY_GENOTYPE);
continue;
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.genotypesArePhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
continue;
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
@ -282,255 +324,77 @@ public class VCFWriter {
else {
val = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
}
} else if ( key.equals(VCFConstants.DEPTH_KEY) && val == null ) {
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
if ( pileup != null )
val = pileup.size();
} else if ( key.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
// VCF 4.0 key for no filters is "."
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4;
}
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key);
if ( metaData != null ) {
VCFHeaderLineType formatType = metaData.getType();
if ( !(val instanceof String) )
val = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT);
Object newVal;
if (typeUsedForFormatString.containsKey(key)) {
VCFHeaderLineType formatType = typeUsedForFormatString.get(key);
if (!val.getClass().equals(String.class))
newVal = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT);
else
newVal = val;
}
else {
newVal = val;
}
if (numberUsedForFormatFields.containsKey(key)){
int numInFormatField = numberUsedForFormatFields.get(key);
if (numInFormatField>1 && val.equals(VCFConstants.MISSING_VALUE_v4)) {
// If we have a missing field but multiple values are expected, we need to construct new string with all fields.
// for example for Number =2, string has to be ".,."
StringBuilder v = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
int numInFormatField = metaData.getCount();
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
v.append(",");
v.append(VCFConstants.MISSING_VALUE_v4);
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
}
newVal = v.toString();
val = sb.toString();
}
}
// assume that if key is absent, given string encoding suffices.
String outputValue = formatVCFField(key, newVal);
// assume that if key is absent, then the given string encoding suffices
String outputValue = formatVCFField(val);
if ( outputValue != null )
vcfG.setField(key, outputValue);
attrs.add(outputValue);
}
genotypeObjects.add(vcfG);
}
mGenotypeRecords.clear();
mGenotypeRecords.addAll(genotypeObjects);
// info fields
Map<String, String> infoFields = new TreeMap<String, String>();
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
String key = elt.getKey();
if ( key.equals("ID") )
continue;
String outputValue = formatVCFField(key, elt.getValue());
if ( outputValue != null )
infoFields.put(key, outputValue);
}
builder.append(contig);
builder.append(VCFConstants.FIELD_SEPARATOR);
builder.append(position);
builder.append(VCFConstants.FIELD_SEPARATOR);
builder.append(ID);
builder.append(VCFConstants.FIELD_SEPARATOR);
builder.append(referenceFromVC);
builder.append(VCFConstants.FIELD_SEPARATOR);
if ( vcfAltAlleles.size() > 0 ) {
builder.append(vcfAltAlleles.get(0));
for ( int i = 1; i < vcfAltAlleles.size(); i++ ) {
builder.append(",");
builder.append(vcfAltAlleles.get(i));
}
} else {
builder.append(VCFConstants.EMPTY_ALLELE);
}
builder.append(VCFConstants.FIELD_SEPARATOR);
if ( qual == -1 )
builder.append(VCFConstants.MISSING_VALUE_v4);
else
builder.append(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual));
builder.append(VCFConstants.FIELD_SEPARATOR);
builder.append(filters);
builder.append(VCFConstants.FIELD_SEPARATOR);
builder.append(createInfoString(infoFields));
if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) {
addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles);
}
return builder.toString();
}
/**
* add the genotype data
*
* @param builder the string builder
* @param header the header object
* @param genotypeFormatString Genotype formatting string
* @param vcfAltAlleles alternate alleles at this site
*/
private void addGenotypeData(StringBuilder builder, VCFHeader header,
String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords);
StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) {
for ( String sample : gMap.keySet() ) {
if ( !header.getGenotypeSamples().contains(sample) )
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
// strip off trailing missing values
for (int i = attrs.size()-1; i >= 0; i--) {
if ( attrs.get(i).equals(VCFConstants.MISSING_VALUE_v4) )
attrs.remove(i);
else
header.getGenotypeSamples().remove(sample);
}
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
}
tempStr.append(VCFConstants.FIELD_SEPARATOR + genotypeFormatString);
String[] genotypeFormatStrings = genotypeFormatString.split(":");
for ( String genotype : header.getGenotypeSamples() ) {
tempStr.append(VCFConstants.FIELD_SEPARATOR);
if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype);
String genotypeString = rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings, true);
// Override default produced genotype string when there are trailing
String[] genotypeStrings = genotypeString.split(":");
int lastUsedPosition = 0;
for (int k=genotypeStrings.length-1; k >=1; k--) {
// see if string represents an empty field. If not, break.
if (!isEmptyField(genotypeStrings[k]) ) {
lastUsedPosition = k;
break;
}
}
// now reconstruct genotypeString from 0 to lastUsedPosition
genotypeString = Utils.join(":",genotypeStrings, 0,lastUsedPosition+1);
tempStr.append(genotypeString);
gMap.remove(genotype);
} else {
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings, true));
}
}
if ( gMap.size() != 0 ) {
for ( String sample : gMap.keySet() )
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
}
builder.append(tempStr);
}
/**
 * Check whether a genotype field is empty, i.e. consists only of missing
 * values: ".", ".,.", ".,.,.", etc.
 *
 * Note: the original body contained a duplicated {@code break;} statement;
 * the second one was unreachable, which is a compile error in Java (JLS 14.21).
 *
 * @param field the raw genotype field text
 * @return true if every comma-separated token is the missing-value marker
 */
boolean isEmptyField(String field) {
    for ( String token : field.split(",") ) {
        if ( !token.equals(VCFConstants.MISSING_VALUE_v4) )
            return false; // found real data, so the field is not empty
    }
    return true;
}
/**
* create a genotype mapping from a list and their sample names
*
* @param list a list of genotype samples
* @return a mapping of the sample name to VCF genotype record
*/
/**
 * Build a sample-name -> genotype-record mapping from a list of records.
 *
 * @param list a list of genotype samples
 * @return a mapping of the sample name to VCF genotype record
 */
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
    Map<String, VCFGenotypeRecord> bySample = new HashMap<String, VCFGenotypeRecord>();
    for ( VCFGenotypeRecord record : list )
        bySample.put(record.getSampleName(), record);
    return bySample;
}
/**
* create the info string
*
* @param infoFields a map of info fields
* @return a string representing the information fields
*/
protected String createInfoString(Map<String,String> infoFields) {
StringBuffer info = new StringBuffer();
boolean isFirst = true;
for (Map.Entry<String, String> entry : infoFields.entrySet()) {
if ( isFirst )
isFirst = false;
else
info.append(VCFConstants.INFO_FIELD_SEPARATOR);
info.append(entry.getKey());
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
int numVals = 1;
String key = entry.getKey();
if (numberUsedForInfoFields.containsKey(key)) {
numVals = numberUsedForInfoFields.get(key);
}
// take care of unbounded encoding
// TODO - workaround for "-1" in original INFO header structure
if (numVals == VCFInfoHeaderLine.UNBOUNDED || numVals < 0)
numVals = 1;
if (numVals > 0) {
info.append("=");
info.append(entry.getValue());
}
for (String s : attrs ) {
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
mWriter.write(s);
}
}
return info.length() == 0 ? VCFConstants.EMPTY_INFO_FIELD : info.toString();
}
private static String formatVCFField(String key, Object val) {
private void writeAllele(Allele allele, Map<Allele, String> alleleMap) throws IOException {
    // Every allele we write must have been registered in the allele map
    // when the record's REF/ALT columns were encoded.
    String encoded = alleleMap.get(allele);
    if ( encoded != null ) {
        mWriter.write(encoded);
    } else {
        throw new StingException("Allele " + allele + " is not an allele in the variant context");
    }
}
private static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFGenotypeRecord.getMissingFieldValue(key);
else if ( val instanceof Double ) {
result = String.format("%.2f", (Double)val);
}
result = VCFConstants.MISSING_VALUE_v4;
else if ( val instanceof Double )
result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val);
else if ( val instanceof Boolean )
result = (Boolean)val ? "" : null; // empty string for true, null for false
else if ( val instanceof List ) {
List list = (List)val;
if ( list.size() == 0 )
return formatVCFField(key, null);
StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0)));
return formatVCFField(null);
StringBuffer sb = new StringBuffer(formatVCFField(list.get(0)));
for ( int i = 1; i < list.size(); i++) {
sb.append(",");
sb.append(formatVCFField(key, list.get(i)));
sb.append(formatVCFField(list.get(i)));
}
result = sb.toString();
} else

View File

@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("845b9a15ac947052ddded5b79228e5ec"));
Arrays.asList("fad2dd71550dec064d458c4aa83e4de9"));
executeTest("Test Indels", spec);
}
}