Refactoring the VCF writer code; now no longer uses VCFRecord or any of its related classes, instead writing directly to the writer. Integration tests pass, but some are actually broken and will be fixed this week.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3822 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
379584f1bf
commit
693672a461
|
|
@ -58,6 +58,9 @@ public final class VCFConstants {
|
|||
public static final String FIELD_SEPARATOR = "\t";
|
||||
public static final String FILTER_CODE_SEPARATOR = ";";
|
||||
public static final String INFO_FIELD_SEPARATOR = ";";
|
||||
public static final String UNPHASED = "/";
|
||||
public static final String PHASED = "|";
|
||||
public static final String PHASED_SWITCH_PROB_v3 = "\\";
|
||||
|
||||
// missing/default values
|
||||
public static final String UNFILTERED = ".";
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ public class VCFHeader {
|
|||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData;
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
|
||||
// the list of auxillary tags
|
||||
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||
|
|
@ -41,6 +43,7 @@ public class VCFHeader {
|
|||
public VCFHeader(Set<VCFHeaderLine> metaData) {
|
||||
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||
loadVCFVersion();
|
||||
loadMetaDataMaps();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -57,6 +60,7 @@ public class VCFHeader {
|
|||
}
|
||||
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
|
||||
loadVCFVersion();
|
||||
loadMetaDataMaps();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -74,6 +78,18 @@ public class VCFHeader {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* load the format/info meta data maps (these are used for quick lookup by key name)
|
||||
*/
|
||||
private void loadMetaDataMaps() {
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFInfoHeaderLine )
|
||||
mInfoMetaData.put(line.getKey(), (VCFInfoHeaderLine)line);
|
||||
else if ( line instanceof VCFFormatHeaderLine )
|
||||
mFormatMetaData.put(line.getKey(), (VCFFormatHeaderLine)line);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header fields in order they're presented in the input file (which is now required to be
|
||||
* the order presented in the spec).
|
||||
|
|
@ -117,11 +133,26 @@ public class VCFHeader {
|
|||
return hasGenotypingData;
|
||||
}
|
||||
|
||||
/** @return the column count, */
|
||||
/** @return the column count */
|
||||
public int getColumnCount() {
|
||||
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFInfoHeaderLine getInfoHeaderLine(String key) {
|
||||
return mInfoMetaData.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
|
||||
return mFormatMetaData.get(key);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -51,14 +51,6 @@ public class VCFGenotypeWriterStorage extends GenotypeWriterStorage<VCFGenotypeW
|
|||
((VCFGenotypeWriter)writer).addRecord(vcfRecord);
|
||||
}
|
||||
|
||||
/**
|
||||
* set the validation stringency
|
||||
* @param value validation stringency value
|
||||
*/
|
||||
public void setValidationStringency(VALIDATION_STRINGENCY value) {
|
||||
((VCFGenotypeWriter)writer).setValidationStringency(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges the stream backing up this temporary storage into the target.
|
||||
* @param target Target stream for the temporary storage. May not be null.
|
||||
|
|
|
|||
|
|
@ -60,12 +60,4 @@ public class VCFGenotypeWriterStub extends GenotypeWriterStub<VCFGenotypeWriter>
|
|||
public void addRecord(VCFRecord vcfRecord) {
|
||||
outputTracker.getStorage(this).addRecord(vcfRecord);
|
||||
}
|
||||
|
||||
/**
|
||||
* set the validation stringency
|
||||
* @param value validation stringency value
|
||||
*/
|
||||
public void setValidationStringency(VALIDATION_STRINGENCY value) {
|
||||
outputTracker.getStorage(this).setValidationStringency(value);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -150,8 +150,10 @@ public class SequenomValidationConverter extends RodWalker<Pair<VariantContext,
|
|||
int goodRecords = numRecords - numHWViolations - numNoCallViolations - numHomVarViolations;
|
||||
System.out.println(String.format("Number of records passing all filters:\t\t\t%d (%d%%)", goodRecords, 100*goodRecords/numRecords));
|
||||
hInfo.add(new VCFHeaderLine("ValidationMetrics_RecordsPassingFilters", String.format("\"%d (%d%%)\"", goodRecords, 100*goodRecords/numRecords)));
|
||||
System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords));
|
||||
hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords)));
|
||||
if ( goodRecords > 0 ) {
|
||||
System.out.println(String.format("Number of passing records that are polymorphic:\t\t%d (%d%%)", numTrueVariants, 100*numTrueVariants/goodRecords));
|
||||
hInfo.add(new VCFHeaderLine("ValidationMetrics_PolymorphicPassingRecords", String.format("\"%d (%d%%)\"", numTrueVariants, 100*numTrueVariants/goodRecords)));
|
||||
}
|
||||
}
|
||||
|
||||
VCFHeader header = new VCFHeader(hInfo, sampleNames);
|
||||
|
|
|
|||
|
|
@ -27,12 +27,4 @@ public interface VCFGenotypeWriter extends GenotypeWriter {
|
|||
* @param vcfRecord Record to add.
|
||||
*/
|
||||
public void addRecord(VCFRecord vcfRecord);
|
||||
|
||||
/**
|
||||
* set the validation stringency
|
||||
* @param value validation stringency value
|
||||
*/
|
||||
public void setValidationStringency(VALIDATION_STRINGENCY value);
|
||||
|
||||
public enum VALIDATION_STRINGENCY { STRICT, SILENT };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,9 +27,6 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
// our log, which we want to capture anything from this class
|
||||
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
|
||||
|
||||
// validation stringency
|
||||
private VALIDATION_STRINGENCY validationStringency = VALIDATION_STRINGENCY.STRICT;
|
||||
|
||||
// allowed genotype format strings
|
||||
private Set<String> allowedGenotypeFormatStrings = null;
|
||||
|
||||
|
|
@ -43,6 +40,10 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
mWriter = new VCFWriter(writeTo);
|
||||
}
|
||||
|
||||
public void addRecord(VCFRecord vcfRecord) {
|
||||
mWriter.addRecord(vcfRecord);
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize this VCF header
|
||||
*
|
||||
|
|
@ -92,16 +93,4 @@ public class VCFGenotypeWriterAdapter implements VCFGenotypeWriter {
|
|||
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
|
||||
mWriter.add(vc, new byte[]{(byte)refAllele.charAt(0)});
|
||||
}
|
||||
|
||||
public void addRecord(VCFRecord vcfRecord) {
|
||||
mWriter.addRecord(vcfRecord, validationStringency);
|
||||
}
|
||||
|
||||
/**
|
||||
* set the validation stringency
|
||||
* @param value validation stringency value
|
||||
*/
|
||||
public void setValidationStringency(VALIDATION_STRINGENCY value) {
|
||||
validationStringency = value;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
|
|
@ -19,7 +20,6 @@ import java.util.*;
|
|||
*/
|
||||
public class VCFWriter {
|
||||
|
||||
|
||||
// the VCF header we're storing
|
||||
private VCFHeader mHeader = null;
|
||||
|
||||
|
|
@ -29,15 +29,6 @@ public class VCFWriter {
|
|||
// were filters applied?
|
||||
private boolean filtersWereAppliedToContext = false;
|
||||
|
||||
// our genotype sample fields
|
||||
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
|
||||
// Properties only used when using VCF4.0 encoding
|
||||
Map<String, VCFHeaderLineType> typeUsedForFormatString = new HashMap<String, VCFHeaderLineType>();
|
||||
Map<String, VCFHeaderLineType> typeUsedForInfoFields = new HashMap<String, VCFHeaderLineType>();
|
||||
Map<String, Integer> numberUsedForInfoFields = new HashMap<String, Integer>();
|
||||
Map<String, Integer> numberUsedForFormatFields = new HashMap<String, Integer>();
|
||||
|
||||
/**
|
||||
* create a VCF writer, given a file to write to
|
||||
*
|
||||
|
|
@ -72,43 +63,36 @@ public class VCFWriter {
|
|||
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n");
|
||||
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ||
|
||||
line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ||
|
||||
line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) )
|
||||
continue;
|
||||
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ||
|
||||
line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ||
|
||||
line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) )
|
||||
continue;
|
||||
|
||||
// Record, if line corresponds to a FORMAT field, which type will be used for writing value
|
||||
if (line.getClass() == VCFFormatHeaderLine.class) {
|
||||
VCFFormatHeaderLine a = (VCFFormatHeaderLine)line;
|
||||
String key = a.getName();
|
||||
typeUsedForFormatString.put(key,a.getType());
|
||||
int num = a.getCount();
|
||||
numberUsedForFormatFields.put(key,num);
|
||||
} else if (line.getClass() == VCFInfoHeaderLine.class) {
|
||||
VCFInfoHeaderLine a = (VCFInfoHeaderLine)line;
|
||||
String key = a.getName();
|
||||
typeUsedForInfoFields.put(key,a.getType());
|
||||
int num = a.getCount();
|
||||
numberUsedForInfoFields.put(key, num);
|
||||
} else if (line.getClass() == VCFFilterHeaderLine.class) {
|
||||
filtersWereAppliedToContext = true;
|
||||
}
|
||||
// are the records filtered (so we know what to put in the FILTER column of passing records) ?
|
||||
if ( line instanceof VCFFilterHeaderLine )
|
||||
filtersWereAppliedToContext = true;
|
||||
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR);
|
||||
mWriter.write(line.toString());
|
||||
mWriter.write("\n");
|
||||
}
|
||||
|
||||
// write out the column line
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append(VCFHeader.HEADER_INDICATOR);
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields())
|
||||
b.append(field + VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
if (header.hasGenotypingData()) {
|
||||
b.append("FORMAT" + VCFConstants.FIELD_SEPARATOR);
|
||||
for (String field : header.getGenotypeSamples())
|
||||
b.append(field + VCFConstants.FIELD_SEPARATOR);
|
||||
mWriter.write(VCFHeader.HEADER_INDICATOR);
|
||||
for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
|
||||
mWriter.write(field.toString());
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
}
|
||||
mWriter.write(b.toString() + "\n");
|
||||
|
||||
if ( header.hasGenotypingData() ) {
|
||||
mWriter.write("FORMAT");
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
mWriter.write(sample);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
} }
|
||||
|
||||
mWriter.write("\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
}
|
||||
catch (IOException e) {
|
||||
|
|
@ -116,39 +100,13 @@ public class VCFWriter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* output a record to the VCF file
|
||||
*
|
||||
* @param record the record to output
|
||||
*/
|
||||
public void add(VCFRecord record) {
|
||||
addRecord(record);
|
||||
}
|
||||
|
||||
public void addRecord(VCFRecord record) {
|
||||
addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT);
|
||||
}
|
||||
|
||||
public void add(VariantContext vc, byte[] refBases) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
|
||||
String vcfString = toStringEncoding(vc, mHeader, refBases);
|
||||
try {
|
||||
mWriter.write(vcfString + "\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to write the VCF object to a file");
|
||||
}
|
||||
|
||||
}
|
||||
/**
|
||||
* output a record to the VCF file
|
||||
*
|
||||
* @param record the record to output
|
||||
* @param validationStringency the validation stringency
|
||||
*/
|
||||
public void addRecord(VCFRecord record, VCFGenotypeWriter.VALIDATION_STRINGENCY validationStringency) {
|
||||
@Deprecated
|
||||
public void addRecord(VCFRecord record) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
|
||||
|
|
@ -174,104 +132,188 @@ public class VCFWriter {
|
|||
}
|
||||
}
|
||||
|
||||
private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
public void add(VariantContext vc, byte[] refBases) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
if ( refBases == null || refBases.length < 1 )
|
||||
throw new IllegalArgumentException("The reference base must be provided to write VCF records");
|
||||
|
||||
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
|
||||
GenomeLoc loc = vc.getLocation();
|
||||
try {
|
||||
GenomeLoc loc = vc.getLocation();
|
||||
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
|
||||
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
|
||||
|
||||
String contig = loc.getContig();
|
||||
long position = loc.getStart();
|
||||
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD;
|
||||
// CHROM
|
||||
mWriter.write(loc.getContig());
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// POS
|
||||
// TODO -- Remove this off-by-one issue when positions are settled in the input
|
||||
mWriter.write(String.valueOf(loc.getStart() - (vc.isIndel() ? 1 : 0)));
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// deal with the reference
|
||||
String referenceFromVC = new String(vc.getReference().getBases());
|
||||
// ID
|
||||
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : VCFConstants.EMPTY_ID_FIELD;
|
||||
mWriter.write(ID);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1;
|
||||
// TODO- clean up these flags and associated code
|
||||
// REF
|
||||
alleleMap.put(vc.getReference(), "0");
|
||||
String refString = makeAlleleString(vc.getReference(), vc.isIndel(), refBases[0]);
|
||||
mWriter.write(refString);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
|
||||
// ALT
|
||||
if ( vc.getAlternateAlleles().size() > 0 ) {
|
||||
Allele altAllele = vc.getAlternateAllele(0);
|
||||
alleleMap.put(altAllele, "1");
|
||||
String alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]);
|
||||
mWriter.write(alt);
|
||||
|
||||
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
|
||||
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFConstants.EMPTY_ALLELE)); // convenience for lookup
|
||||
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
|
||||
int numTrailingBases = 0, numPaddingBases = 0;
|
||||
String paddingBases = "";
|
||||
String trailingBases = "";
|
||||
|
||||
// search for reference allele and find trailing and padding at the end.
|
||||
// See first if all alleles have a base encoding (ie no deletions)
|
||||
// if so, add one common base to all alleles (reference at this location)
|
||||
boolean hasBasesInAllAlleles = true;
|
||||
String refString = null;
|
||||
for ( Allele a : vc.getAlleles() ) {
|
||||
if (a.isNull() || a.isNoCall())
|
||||
hasBasesInAllAlleles = false;
|
||||
|
||||
if (a.isReference())
|
||||
refString = new String(a.getBases());
|
||||
}
|
||||
|
||||
|
||||
// Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK.
|
||||
if (!hasBasesInAllAlleles) {
|
||||
trailingBases = new String(refBases);
|
||||
numTrailingBases = 1;
|
||||
position--;
|
||||
}
|
||||
|
||||
|
||||
for ( Allele a : vc.getAlleles() ) {
|
||||
|
||||
VCFGenotypeEncoding encoding;
|
||||
|
||||
|
||||
String alleleString = new String(a.getBases());
|
||||
String s = trailingBases+alleleString+paddingBases;
|
||||
|
||||
encoding = new VCFGenotypeEncoding(s, true);
|
||||
|
||||
// overwrite reference string by possibly padded version
|
||||
if (a.isReference())
|
||||
referenceFromVC = s;
|
||||
|
||||
else {
|
||||
vcfAltAlleles.add(encoding);
|
||||
for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
|
||||
altAllele = vc.getAlternateAllele(i);
|
||||
alleleMap.put(altAllele, String.valueOf(i+1));
|
||||
alt = makeAlleleString(altAllele, vc.isIndel(), refBases[0]);
|
||||
mWriter.write(",");
|
||||
mWriter.write(alt);
|
||||
}
|
||||
} else {
|
||||
mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
|
||||
}
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
alleleMap.put(a, encoding);
|
||||
}
|
||||
// QUAL
|
||||
if ( !vc.hasNegLog10PError() )
|
||||
mWriter.write(VCFConstants.MISSING_VALUE_v4);
|
||||
else
|
||||
mWriter.write(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, vc.getPhredScaledQual()));
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
||||
vcfGenotypeAttributeKeys.add(key);
|
||||
}
|
||||
} else if ( header.hasGenotypingData() ) {
|
||||
// this needs to be done in case all samples are no-calls
|
||||
vcfGenotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||
}
|
||||
// FILTER
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
|
||||
mWriter.write(filters);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
|
||||
|
||||
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
|
||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
|
||||
|
||||
for ( Allele a : g.getAlleles() ) {
|
||||
encodings.add(alleleMap.get(a));
|
||||
}
|
||||
|
||||
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
|
||||
VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing);
|
||||
|
||||
for ( String key : vcfGenotypeAttributeKeys ) {
|
||||
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
|
||||
// INFO
|
||||
Map<String, String> infoFields = new TreeMap<String, String>();
|
||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
||||
String key = field.getKey();
|
||||
if ( key.equals("ID") )
|
||||
continue;
|
||||
|
||||
String outputValue = formatVCFField(field.getValue());
|
||||
if ( outputValue != null )
|
||||
infoFields.put(key, outputValue);
|
||||
}
|
||||
writeInfoString(infoFields);
|
||||
|
||||
// FORMAT
|
||||
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
||||
genotypeAttributeKeys.add(key);
|
||||
}
|
||||
} else if ( mHeader.hasGenotypingData() ) {
|
||||
// this needs to be done in case all samples are no-calls
|
||||
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||
}
|
||||
|
||||
if ( genotypeAttributeKeys.size() > 0 ) {
|
||||
String genotypeFormatString = Utils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
mWriter.write(genotypeFormatString);
|
||||
|
||||
addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
|
||||
}
|
||||
|
||||
mWriter.write("\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to write the VCF object to a file");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private String makeAlleleString(Allele allele, boolean isIndel, byte ref) {
|
||||
String s = new String(allele.getBases());
|
||||
if ( isIndel || s.length() == 0 ) // in case the context is monomorphic at an indel site
|
||||
s = (char)ref + s;
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the info string; assumes that no values are null
|
||||
*
|
||||
* @param infoFields a map of info fields
|
||||
* @throws IOException for writer
|
||||
*/
|
||||
protected void writeInfoString(Map<String, String> infoFields) throws IOException {
|
||||
if ( infoFields.isEmpty() ) {
|
||||
mWriter.write(VCFConstants.EMPTY_INFO_FIELD);
|
||||
return;
|
||||
}
|
||||
|
||||
boolean isFirst = true;
|
||||
for ( Map.Entry<String, String> entry : infoFields.entrySet() ) {
|
||||
if ( isFirst )
|
||||
isFirst = false;
|
||||
else
|
||||
mWriter.write(VCFConstants.INFO_FIELD_SEPARATOR);
|
||||
|
||||
String key = entry.getKey();
|
||||
mWriter.write(key);
|
||||
|
||||
if ( !entry.getValue().equals("") ) {
|
||||
int numVals = 1;
|
||||
VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key);
|
||||
if ( metaData != null )
|
||||
numVals = metaData.getCount();
|
||||
|
||||
// take care of unbounded encoding
|
||||
if ( numVals == VCFInfoHeaderLine.UNBOUNDED )
|
||||
numVals = 1;
|
||||
|
||||
if ( numVals > 0 ) {
|
||||
mWriter.write("=");
|
||||
mWriter.write(entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* add the genotype data
|
||||
*
|
||||
* @param vc the variant context
|
||||
* @param genotypeFormatKeys Genotype formatting string
|
||||
* @param alleleMap alleles for this context
|
||||
* @throws IOException for writer
|
||||
*/
|
||||
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
|
||||
throws IOException {
|
||||
|
||||
for ( String sample : mHeader.getGenotypeSamples() ) {
|
||||
mWriter.write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
if ( g == null ) {
|
||||
// TODO -- The VariantContext needs to know what the general ploidy is of the samples
|
||||
// TODO -- We shouldn't be assuming diploid genotypes here!
|
||||
mWriter.write(VCFConstants.EMPTY_GENOTYPE);
|
||||
continue;
|
||||
}
|
||||
|
||||
writeAllele(g.getAllele(0), alleleMap);
|
||||
for (int i = 1; i < g.getPloidy(); i++) {
|
||||
mWriter.write(g.genotypesArePhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
|
||||
writeAllele(g.getAllele(i), alleleMap);
|
||||
}
|
||||
|
||||
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
|
||||
for ( String key : genotypeFormatKeys ) {
|
||||
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
|
||||
continue;
|
||||
|
||||
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
|
||||
|
||||
|
|
@ -282,255 +324,77 @@ public class VCFWriter {
|
|||
else {
|
||||
val = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
|
||||
}
|
||||
|
||||
} else if ( key.equals(VCFConstants.DEPTH_KEY) && val == null ) {
|
||||
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
|
||||
if ( pileup != null )
|
||||
val = pileup.size();
|
||||
} else if ( key.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
// VCF 4.0 key for no filters is "."
|
||||
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4;
|
||||
}
|
||||
|
||||
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key);
|
||||
if ( metaData != null ) {
|
||||
VCFHeaderLineType formatType = metaData.getType();
|
||||
if ( !(val instanceof String) )
|
||||
val = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT);
|
||||
|
||||
Object newVal;
|
||||
if (typeUsedForFormatString.containsKey(key)) {
|
||||
VCFHeaderLineType formatType = typeUsedForFormatString.get(key);
|
||||
if (!val.getClass().equals(String.class))
|
||||
newVal = formatType.convert(String.valueOf(val), VCFCompoundHeaderLine.SupportedHeaderLineType.FORMAT);
|
||||
else
|
||||
newVal = val;
|
||||
|
||||
}
|
||||
else {
|
||||
newVal = val;
|
||||
}
|
||||
|
||||
if (numberUsedForFormatFields.containsKey(key)){
|
||||
int numInFormatField = numberUsedForFormatFields.get(key);
|
||||
if (numInFormatField>1 && val.equals(VCFConstants.MISSING_VALUE_v4)) {
|
||||
// If we have a missing field but multiple values are expected, we need to construct new string with all fields.
|
||||
// for example for Number =2, string has to be ".,."
|
||||
StringBuilder v = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
|
||||
int numInFormatField = metaData.getCount();
|
||||
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
|
||||
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
|
||||
// For example, if Number=2, the string has to be ".,."
|
||||
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
|
||||
for ( int i = 1; i < numInFormatField; i++ ) {
|
||||
v.append(",");
|
||||
v.append(VCFConstants.MISSING_VALUE_v4);
|
||||
sb.append(",");
|
||||
sb.append(VCFConstants.MISSING_VALUE_v4);
|
||||
}
|
||||
newVal = v.toString();
|
||||
val = sb.toString();
|
||||
}
|
||||
}
|
||||
// assume that if key is absent, given string encoding suffices.
|
||||
String outputValue = formatVCFField(key, newVal);
|
||||
|
||||
// assume that if key is absent, then the given string encoding suffices
|
||||
String outputValue = formatVCFField(val);
|
||||
if ( outputValue != null )
|
||||
vcfG.setField(key, outputValue);
|
||||
attrs.add(outputValue);
|
||||
}
|
||||
|
||||
genotypeObjects.add(vcfG);
|
||||
}
|
||||
|
||||
mGenotypeRecords.clear();
|
||||
mGenotypeRecords.addAll(genotypeObjects);
|
||||
// info fields
|
||||
Map<String, String> infoFields = new TreeMap<String, String>();
|
||||
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
|
||||
String key = elt.getKey();
|
||||
if ( key.equals("ID") )
|
||||
continue;
|
||||
|
||||
String outputValue = formatVCFField(key, elt.getValue());
|
||||
if ( outputValue != null )
|
||||
infoFields.put(key, outputValue);
|
||||
}
|
||||
|
||||
|
||||
|
||||
builder.append(contig);
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
builder.append(position);
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
builder.append(ID);
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
builder.append(referenceFromVC);
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
if ( vcfAltAlleles.size() > 0 ) {
|
||||
builder.append(vcfAltAlleles.get(0));
|
||||
for ( int i = 1; i < vcfAltAlleles.size(); i++ ) {
|
||||
builder.append(",");
|
||||
builder.append(vcfAltAlleles.get(i));
|
||||
}
|
||||
} else {
|
||||
builder.append(VCFConstants.EMPTY_ALLELE);
|
||||
}
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
|
||||
if ( qual == -1 )
|
||||
builder.append(VCFConstants.MISSING_VALUE_v4);
|
||||
else
|
||||
builder.append(String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual));
|
||||
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
|
||||
builder.append(filters);
|
||||
builder.append(VCFConstants.FIELD_SEPARATOR);
|
||||
builder.append(createInfoString(infoFields));
|
||||
|
||||
if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) {
|
||||
addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* add the genotype data
|
||||
*
|
||||
* @param builder the string builder
|
||||
* @param header the header object
|
||||
* @param genotypeFormatString Genotype formatting string
|
||||
* @param vcfAltAlleles alternate alleles at this site
|
||||
*/
|
||||
private void addGenotypeData(StringBuilder builder, VCFHeader header,
|
||||
String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) {
|
||||
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords);
|
||||
|
||||
StringBuffer tempStr = new StringBuffer();
|
||||
if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) {
|
||||
for ( String sample : gMap.keySet() ) {
|
||||
if ( !header.getGenotypeSamples().contains(sample) )
|
||||
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
|
||||
// strip off trailing missing values
|
||||
for (int i = attrs.size()-1; i >= 0; i--) {
|
||||
if ( attrs.get(i).equals(VCFConstants.MISSING_VALUE_v4) )
|
||||
attrs.remove(i);
|
||||
else
|
||||
header.getGenotypeSamples().remove(sample);
|
||||
}
|
||||
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
|
||||
}
|
||||
tempStr.append(VCFConstants.FIELD_SEPARATOR + genotypeFormatString);
|
||||
|
||||
String[] genotypeFormatStrings = genotypeFormatString.split(":");
|
||||
|
||||
for ( String genotype : header.getGenotypeSamples() ) {
|
||||
tempStr.append(VCFConstants.FIELD_SEPARATOR);
|
||||
if ( gMap.containsKey(genotype) ) {
|
||||
VCFGenotypeRecord rec = gMap.get(genotype);
|
||||
String genotypeString = rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings, true);
|
||||
|
||||
// Override default produced genotype string when there are trailing
|
||||
String[] genotypeStrings = genotypeString.split(":");
|
||||
int lastUsedPosition = 0;
|
||||
for (int k=genotypeStrings.length-1; k >=1; k--) {
|
||||
// see if string represents an empty field. If not, break.
|
||||
if (!isEmptyField(genotypeStrings[k]) ) {
|
||||
lastUsedPosition = k;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// now reconstruct genotypeString from 0 to lastUsedPosition
|
||||
genotypeString = Utils.join(":",genotypeStrings, 0,lastUsedPosition+1);
|
||||
tempStr.append(genotypeString);
|
||||
gMap.remove(genotype);
|
||||
} else {
|
||||
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings, true));
|
||||
}
|
||||
}
|
||||
if ( gMap.size() != 0 ) {
|
||||
for ( String sample : gMap.keySet() )
|
||||
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
|
||||
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
|
||||
}
|
||||
|
||||
builder.append(tempStr);
|
||||
}
|
||||
|
||||
boolean isEmptyField(String field) {
|
||||
// check if given genotype field is empty, ie either ".", or ".,.", or ".,.,.", etc.
|
||||
String[] fields = field.split(",");
|
||||
boolean isEmpty = true;
|
||||
|
||||
for (int k=0; k < fields.length; k++) {
|
||||
if (!fields[k].equals(VCFConstants.MISSING_VALUE_v4)) {
|
||||
isEmpty = false;
|
||||
break;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
return isEmpty;
|
||||
|
||||
}
|
||||
/**
|
||||
* create a genotype mapping from a list and their sample names
|
||||
*
|
||||
* @param list a list of genotype samples
|
||||
* @return a mapping of the sample name to VCF genotype record
|
||||
*/
|
||||
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
|
||||
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
VCFGenotypeRecord rec = list.get(i);
|
||||
map.put(rec.getSampleName(), rec);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the info string
|
||||
*
|
||||
* @param infoFields a map of info fields
|
||||
* @return a string representing the infomation fields
|
||||
*/
|
||||
protected String createInfoString(Map<String,String> infoFields) {
|
||||
StringBuffer info = new StringBuffer();
|
||||
boolean isFirst = true;
|
||||
for (Map.Entry<String, String> entry : infoFields.entrySet()) {
|
||||
if ( isFirst )
|
||||
isFirst = false;
|
||||
else
|
||||
info.append(VCFConstants.INFO_FIELD_SEPARATOR);
|
||||
|
||||
info.append(entry.getKey());
|
||||
|
||||
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
|
||||
int numVals = 1;
|
||||
String key = entry.getKey();
|
||||
if (numberUsedForInfoFields.containsKey(key)) {
|
||||
numVals = numberUsedForInfoFields.get(key);
|
||||
}
|
||||
|
||||
// take care of unbounded encoding
|
||||
// TODO - workaround for "-1" in original INFO header structure
|
||||
if (numVals == VCFInfoHeaderLine.UNBOUNDED || numVals < 0)
|
||||
numVals = 1;
|
||||
|
||||
if (numVals > 0) {
|
||||
info.append("=");
|
||||
info.append(entry.getValue());
|
||||
}
|
||||
for (String s : attrs ) {
|
||||
mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
|
||||
mWriter.write(s);
|
||||
}
|
||||
}
|
||||
return info.length() == 0 ? VCFConstants.EMPTY_INFO_FIELD : info.toString();
|
||||
}
|
||||
|
||||
private static String formatVCFField(String key, Object val) {
|
||||
private void writeAllele(Allele allele, Map<Allele, String> alleleMap) throws IOException {
|
||||
String encoding = alleleMap.get(allele);
|
||||
if ( encoding == null )
|
||||
throw new StingException("Allele " + allele + " is not an allele in the variant context");
|
||||
mWriter.write(encoding);
|
||||
}
|
||||
|
||||
private static String formatVCFField(Object val) {
|
||||
String result;
|
||||
if ( val == null )
|
||||
result = VCFGenotypeRecord.getMissingFieldValue(key);
|
||||
else if ( val instanceof Double ) {
|
||||
result = String.format("%.2f", (Double)val);
|
||||
}
|
||||
result = VCFConstants.MISSING_VALUE_v4;
|
||||
else if ( val instanceof Double )
|
||||
result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val);
|
||||
else if ( val instanceof Boolean )
|
||||
result = (Boolean)val ? "" : null; // empty string for true, null for false
|
||||
else if ( val instanceof List ) {
|
||||
List list = (List)val;
|
||||
if ( list.size() == 0 )
|
||||
return formatVCFField(key, null);
|
||||
StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0)));
|
||||
return formatVCFField(null);
|
||||
StringBuffer sb = new StringBuffer(formatVCFField(list.get(0)));
|
||||
for ( int i = 1; i < list.size(); i++) {
|
||||
sb.append(",");
|
||||
sb.append(formatVCFField(key, list.get(i)));
|
||||
sb.append(formatVCFField(list.get(i)));
|
||||
}
|
||||
result = sb.toString();
|
||||
} else
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ public class SequenomValidationConverterIntegrationTest extends WalkerTest {
|
|||
String testPedFile = validationDataLocation + "pilot2_indel_validation.renamed.ped";
|
||||
String testArgs = "-R "+oneKGLocation+"reference/human_b36_both.fasta -T SequenomValidationConverter -B sequenom,Plink,"+testPedFile+" -o %s";
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||
Arrays.asList("845b9a15ac947052ddded5b79228e5ec"));
|
||||
Arrays.asList("fad2dd71550dec064d458c4aa83e4de9"));
|
||||
executeTest("Test Indels", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue