More necessary fixes for VCF4.0 - now results look more sensible in realistic, bigger VCF files produced by say Dindel and not just the small test VCF:

- Fixed and cleaned code to produce trailing and padding bases in alleles around indels.
- Deal better with missing fields.
Pending:
- Chopping missing fields at end of genotypes.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3679 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2010-06-30 02:59:30 +00:00
parent 12c0de6170
commit d932322190
2 changed files with 119 additions and 102 deletions

View File

@ -275,6 +275,10 @@ public class VCFGenotypeRecord {
* @return a string * @return a string
*/ */
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings) { public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings) {
return toStringEncoding(altAlleles, genotypeFormatStrings, false);
}
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings, boolean doVCF40) {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
builder.append(toGenotypeString(altAlleles)); builder.append(toGenotypeString(altAlleles));
@ -288,7 +292,7 @@ public class VCFGenotypeRecord {
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
if ( value == null || value.equals("") ) if ( value == null || value.equals("") )
builder.append(getMissingFieldValue(field)); builder.append(getMissingFieldValue(field, doVCF40));
else else
builder.append(value); builder.append(value);
} }
@ -304,6 +308,10 @@ public class VCFGenotypeRecord {
* @return a string * @return a string
*/ */
public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) { public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) {
// backward compatibility to VCF 3.3
return stringEncodingForEmptyGenotype(genotypeFormatStrings, false);
}
public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings, boolean doVCF40) {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
builder.append(EMPTY_GENOTYPE); builder.append(EMPTY_GENOTYPE);
@ -311,15 +319,35 @@ public class VCFGenotypeRecord {
if ( field.equals(GENOTYPE_KEY) ) if ( field.equals(GENOTYPE_KEY) )
continue; continue;
// in VCF4.0, if a genotype is empty only the ./. key can be included
if (!doVCF40) {
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
builder.append(getMissingFieldValue(field)); builder.append(getMissingFieldValue(field));
} }
}
return builder.toString(); return builder.toString();
} }
public static String getMissingFieldValue(String field) { public static String getMissingFieldValue(String field) {
String result = ""; // backward compatibility to VCF 3.3
return getMissingFieldValue(field, false);
}
public static String getMissingFieldValue(String field, boolean doVCF40) {
String result;
if (doVCF40) {
result = "."; // default missing value
// TODO - take number of elements in field as input and output corresponding .'s
if ( field.equals(GENOTYPE_LIKELIHOODS_KEY) )
result = ".,.,.";
else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
result = ".,.";
}
else {
result = "";
if ( field.equals(GENOTYPE_QUALITY_KEY) ) if ( field.equals(GENOTYPE_QUALITY_KEY) )
result = String.valueOf(MISSING_GENOTYPE_QUALITY); result = String.valueOf(MISSING_GENOTYPE_QUALITY);
else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) ) else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) )
@ -331,6 +359,7 @@ public class VCFGenotypeRecord {
// TODO -- support haplotype quality // TODO -- support haplotype quality
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) ) //else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY); // result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
}
return result; return result;
} }

View File

@ -143,10 +143,13 @@ public class VCFWriter {
// write out the column line // write out the column line
StringBuilder b = new StringBuilder(); StringBuilder b = new StringBuilder();
b.append(VCFHeader.HEADER_INDICATOR); b.append(VCFHeader.HEADER_INDICATOR);
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPARATOR); for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields())
b.append(field + FIELD_SEPARATOR);
if (header.hasGenotypingData()) { if (header.hasGenotypingData()) {
b.append("FORMAT" + FIELD_SEPARATOR); b.append("FORMAT" + FIELD_SEPARATOR);
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPARATOR); for (String field : header.getGenotypeSamples())
b.append(field + FIELD_SEPARATOR);
} }
mWriter.write(b.toString() + "\n"); mWriter.write(b.toString() + "\n");
mWriter.flush(); // necessary so that writing to an output stream will work mWriter.flush(); // necessary so that writing to an output stream will work
@ -224,15 +227,9 @@ public class VCFWriter {
long position = loc.getStart(); long position = loc.getStart();
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : EMPTY_ID_FIELD; String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : EMPTY_ID_FIELD;
builder.append(contig);
builder.append(FIELD_SEPARATOR);
builder.append(position);
builder.append(FIELD_SEPARATOR);
builder.append(ID);
builder.append(FIELD_SEPARATOR);
// deal with the reference // deal with the reference
String referenceBases = new String(vc.getReference().getBases()); String referenceFromVC = new String(vc.getReference().getBases());
double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1; double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1;
// TODO- clean up these flags and associated code // TODO- clean up these flags and associated code
@ -241,7 +238,6 @@ public class VCFWriter {
boolean filtersWereAppliedToGenotypes = false; boolean filtersWereAppliedToGenotypes = false;
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED); String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
String referenceString = new String(vc.getReference().getBases());
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>(); Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>(); List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
@ -252,12 +248,7 @@ public class VCFWriter {
ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST"); ArrayList<Allele> originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST");
boolean isComplexEvent = false;
if (referenceBases.length()>1) {
// complex event: by VCF 4.0, reference from previous position is included.
position--;
isComplexEvent = true;
}
// search for reference allele and find trailing and padding at the end. // search for reference allele and find trailing and padding at the end.
if (originalAlleles != null) { if (originalAlleles != null) {
@ -277,7 +268,16 @@ public class VCFWriter {
String refString = new String(a.getBases()); String refString = new String(a.getBases());
String originalRef = new String(originalReferenceAllele.getBases()); String originalRef = new String(originalReferenceAllele.getBases());
if (refString.equals("")) {
// special case with null reference:
// for example, null reference, originalRef = "AC", alt = "AGC".
// In this case, we'd have one trailing and one padding base, with alleles as {-,G}.
// TEMP fix: assume in this case one trailing base, more general solution would be to
// scan through all alleles and look for maximum common subsequence at beginning and at end for all alleles
numTrailingBases = 1;
} else
numTrailingBases = originalRef.indexOf(refString); numTrailingBases = originalRef.indexOf(refString);
numPaddingBases = originalRef.length()-refString.length()-numTrailingBases; numPaddingBases = originalRef.length()-refString.length()-numTrailingBases;
if (numTrailingBases > 0) { if (numTrailingBases > 0) {
@ -285,12 +285,14 @@ public class VCFWriter {
trailingBases = originalRef.substring(0,numTrailingBases); trailingBases = originalRef.substring(0,numTrailingBases);
} }
if (numPaddingBases > 0) if (numPaddingBases > 0)
paddingBases = originalRef.substring(refString.length(),originalRef.length()-1); paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length());
} }
} else { } else {
// no original Allele information: // no original Allele information: add one common base to all alleles (reference at this location)
if (isComplexEvent) trailingBases = new String(refBases);
numTrailingBases = 1;
position--; position--;
} }
@ -300,42 +302,15 @@ public class VCFWriter {
String alleleString = new String(a.getBases()); String alleleString = new String(a.getBases());
if ( isComplexEvent ) { String s = trailingBases+alleleString+paddingBases;
// Mixed variants (e.g. microsatellites) have the reference before the event included
String s;
if (numTrailingBases > 0) {
s = trailingBases+alleleString+paddingBases;
} else {
s = new String(refBases)+alleleString;
}
encoding = new VCFGenotypeEncoding(s, true); encoding = new VCFGenotypeEncoding(s, true);
// overwrite reference string by possibly padded version
if (a.isReference()) if (a.isReference())
referenceString = s; referenceFromVC = s;
else {
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
if ( a.isNull() ) {
if ( a.isReference() ) {
// ref, where alt is insertion
encoding = new VCFGenotypeEncoding(new String(refBases));
} else {
// non-ref deletion
encoding = new VCFGenotypeEncoding(".");
}
} else if ( a.isReference() ) {
// ref, where alt is deletion
encoding = new VCFGenotypeEncoding(new String(refBases));
} else {
// non-ref insertion
referenceString = new String(refBases)+alleleString;
encoding = new VCFGenotypeEncoding(referenceString, true);
}
} else {
// no variation, ref or alt for snp
encoding = new VCFGenotypeEncoding(alleleString);
}
if ( a.isNonReference() ) {
vcfAltAlleles.add(encoding); vcfAltAlleles.add(encoding);
} }
@ -375,8 +350,11 @@ public class VCFWriter {
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) { if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
if ( MathUtils.compareDoubles(g.getNegLog10PError(), Genotype.NO_NEG_LOG_10PERROR) == 0 ) if ( MathUtils.compareDoubles(g.getNegLog10PError(), Genotype.NO_NEG_LOG_10PERROR) == 0 )
val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY; val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY;
else else {
val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE); // TODO - check whether we need to saturate quality to 99 as in VCF3.3 coder. For now allow unbounded values
// val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE);
val = g.getPhredScaledQual();
}
} else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) { } else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) {
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY); ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
@ -385,9 +363,14 @@ public class VCFWriter {
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) { } else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS; val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS;
} }
// TODO - do I need this?
/*else if (val == null) {
// generic case when there's no value associated with entry:
val = MISSING_GENOTYPE_FIELD;
} */
String outputValue;
if (writingVCF40Format) {
VCFFormatHeaderLine.FORMAT_TYPE formatType = typeUsedForFormatString.get(key); VCFFormatHeaderLine.FORMAT_TYPE formatType = typeUsedForFormatString.get(key);
Object newVal; Object newVal;
if (!val.getClass().equals(String.class)) if (!val.getClass().equals(String.class))
@ -405,10 +388,9 @@ public class VCFWriter {
} }
newVal = v.toString(); newVal = v.toString();
} }
outputValue = formatVCFField(key, newVal); String outputValue = formatVCFField(key, newVal);
} else {
outputValue = formatVCFField(key, val);
}
if ( outputValue != null ) if ( outputValue != null )
vcfG.setField(key, outputValue); vcfG.setField(key, outputValue);
} }
@ -436,7 +418,13 @@ public class VCFWriter {
builder.append(referenceString); builder.append(contig);
builder.append(FIELD_SEPARATOR);
builder.append(position);
builder.append(FIELD_SEPARATOR);
builder.append(ID);
builder.append(FIELD_SEPARATOR);
builder.append(referenceFromVC);
builder.append(FIELD_SEPARATOR); builder.append(FIELD_SEPARATOR);
if ( vcfAltAlleles.size() > 0 ) { if ( vcfAltAlleles.size() > 0 ) {
@ -478,7 +466,7 @@ public class VCFWriter {
* @param builder the string builder * @param builder the string builder
* @param header the header object * @param header the header object
*/ */
private static void addGenotypeData(StringBuilder builder, VCFHeader header, private void addGenotypeData(StringBuilder builder, VCFHeader header,
String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) { String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords); Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords);
@ -500,10 +488,10 @@ public class VCFWriter {
tempStr.append(FIELD_SEPARATOR); tempStr.append(FIELD_SEPARATOR);
if ( gMap.containsKey(genotype) ) { if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype); VCFGenotypeRecord rec = gMap.get(genotype);
tempStr.append(rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings)); tempStr.append(rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings, true));
gMap.remove(genotype); gMap.remove(genotype);
} else { } else {
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings)); tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings, true));
} }
} }
if ( gMap.size() != 0 ) { if ( gMap.size() != 0 ) {