Allowing VCF records without GTs in vf4.1

This commit is contained in:
Eric Banks 2011-07-13 09:56:08 -04:00
parent a2597e7f00
commit 6007eea3ff
5 changed files with 83 additions and 42 deletions

View File

@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter {
} else {
List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
for ( String key : calcVCFGenotypeKeys(vc) ) {
genotypeAttributeKeys.add(key);
}
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
} else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter {
continue;
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
continue;
}
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
@ -488,10 +492,13 @@ public class StandardVCFWriter implements VCFWriter {
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();
boolean sawGoodGT = false;
boolean sawGoodQual = false;
boolean sawGenotypeFilter = false;
for ( Genotype g : vc.getGenotypes().values() ) {
keys.addAll(g.getAttributes().keySet());
if ( g.isAvailable() )
sawGoodGT = true;
if ( g.hasNegLog10PError() )
sawGoodQual = true;
if (g.isFiltered() && g.isCalled())
@ -504,7 +511,17 @@ public class StandardVCFWriter implements VCFWriter {
if (sawGenotypeFilter)
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
return ParsingUtils.sortList(new ArrayList<String>(keys));
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
// make sure the GT is first
if ( sawGoodGT ) {
List<String> newList = new ArrayList<String>(sortedList.size()+1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
}
return sortedList;
}

View File

@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec {
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec {
}
}
// check to make sure we found a gentoype field
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {

View File

@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec {
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec {
}
}
// check to make sure we found a gentoype field
// TODO -- This is no longer required in v4.1
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
// todo -- assuming allele list length in the single digits is bad. Fix me.
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
genotypes.put(sampleName,
new Genotype(sampleName,
GTalleles,
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -19,12 +20,14 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = null; // new ArrayList<Allele>();
protected Type type = null;
protected boolean isPhased = false;
private boolean filtersWereAppliedToContext;
protected boolean filtersWereAppliedToContext;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
this.alleles = Collections.unmodifiableList(alleles);
if ( alleles != null )
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
filtersWereAppliedToContext = filters != null;
this.isPhased = isPhased;
@ -66,6 +69,9 @@ public class Genotype {
}
public List<Allele> getAlleles(Allele allele) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
List<Allele> al = new ArrayList<Allele>();
for ( Allele a : alleles )
if ( a.equals(allele) )
@ -75,6 +81,8 @@ public class Genotype {
}
public Allele getAllele(int i) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
return alleles.get(i);
}
@ -89,10 +97,21 @@ public class Genotype {
NO_CALL,
HOM_REF,
HET,
HOM_VAR
HOM_VAR,
UNAVAILABLE
}
public Type getType() {
if ( type == null ) {
type = determineType();
}
return type;
}
protected Type determineType() {
if ( alleles == null )
return Type.UNAVAILABLE;
Allele firstAllele = alleles.get(0);
if ( firstAllele.isNoCall() ) {
@ -122,7 +141,8 @@ public class Genotype {
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
*/
public boolean isNoCall() { return getType() == Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
//
// Useful methods for getting genotype likelihoods for a genotype object, if present
@ -157,8 +177,8 @@ public class Genotype {
}
public void validate() {
if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles");
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
if ( alleles == null ) return;
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
int nNoCalls = 0;
for ( Allele allele : alleles ) {
@ -175,6 +195,9 @@ public class Genotype {
}
public String getGenotypeString(boolean ignoreRefState) {
if ( alleles == null )
return null;
// Notes:
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)

View File

@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
if ( g.isAvailable() ) {
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
}
}
}
}