Allowing VCF records without GTs in vf4.1

This commit is contained in:
Eric Banks 2011-07-13 09:56:08 -04:00
parent a2597e7f00
commit 6007eea3ff
5 changed files with 83 additions and 42 deletions

View File

@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.util.ParsingUtils;
import org.broad.tribble.util.PositionalStream; import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter {
} else { } else {
List<String> genotypeAttributeKeys = new ArrayList<String>(); List<String> genotypeAttributeKeys = new ArrayList<String>();
if ( vc.hasGenotypes() ) { if ( vc.hasGenotypes() ) {
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
for ( String key : calcVCFGenotypeKeys(vc) ) {
genotypeAttributeKeys.add(key);
}
} else if ( mHeader.hasGenotypingData() ) { } else if ( mHeader.hasGenotypingData() ) {
// this needs to be done in case all samples are no-calls // this needs to be done in case all samples are no-calls
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY); genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter {
continue; continue;
} }
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size()); List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) { for ( String key : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
}
writeAllele(g.getAllele(0), alleleMap);
for (int i = 1; i < g.getPloidy(); i++) {
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap);
}
continue; continue;
}
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4; Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
@ -488,10 +492,13 @@ public class StandardVCFWriter implements VCFWriter {
private static List<String> calcVCFGenotypeKeys(VariantContext vc) { private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>(); Set<String> keys = new HashSet<String>();
boolean sawGoodGT = false;
boolean sawGoodQual = false; boolean sawGoodQual = false;
boolean sawGenotypeFilter = false; boolean sawGenotypeFilter = false;
for ( Genotype g : vc.getGenotypes().values() ) { for ( Genotype g : vc.getGenotypes().values() ) {
keys.addAll(g.getAttributes().keySet()); keys.addAll(g.getAttributes().keySet());
if ( g.isAvailable() )
sawGoodGT = true;
if ( g.hasNegLog10PError() ) if ( g.hasNegLog10PError() )
sawGoodQual = true; sawGoodQual = true;
if (g.isFiltered() && g.isCalled()) if (g.isFiltered() && g.isCalled())
@ -504,7 +511,17 @@ public class StandardVCFWriter implements VCFWriter {
if (sawGenotypeFilter) if (sawGenotypeFilter)
keys.add(VCFConstants.GENOTYPE_FILTER_KEY); keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
return ParsingUtils.sortList(new ArrayList<String>(keys)); List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
// make sure the GT is first
if ( sawGoodGT ) {
List<String> newList = new ArrayList<String>(sortedList.size()+1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
}
return sortedList;
} }

View File

@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec {
boolean missing = i >= GTValueSplitSize; boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i; genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec {
} }
} }
// check to make sure we found a gentoype field // check to make sure we found a genotype field
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
// todo -- assuming allele list length in the single digits is bad. Fix me. boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// Check for > 1 for haploid genotypes
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
// add it to the list // add it to the list
try { try {

View File

@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec {
// todo -- all of these on the fly parsing of the missing value should be static constants // todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
if (i != 0)
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
genotypeAlleleLocation = i; genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]); GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
@ -160,22 +158,24 @@ public class VCFCodec extends AbstractVCFCodec {
} }
} }
// check to make sure we found a gentoype field // check to make sure we found a genotype field if we are a VCF4.0 file
// TODO -- This is no longer required in v4.1 if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field"); generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
// todo -- assuming allele list length in the single digits is bad. Fix me. List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
// Check for > 1 for haploid genotypes boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
// add it to the list // add it to the list
try { try {
genotypes.put(sampleName, new Genotype(sampleName, genotypes.put(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap), new Genotype(sampleName,
GTQual, GTalleles,
genotypeFilters, GTQual,
gtAttributes, genotypeFilters,
phased)); gtAttributes,
phased));
} catch (TribbleException e) { } catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
} }

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.util.ParsingUtils; import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*; import java.util.*;
@ -19,12 +20,14 @@ public class Genotype {
protected InferredGeneticContext commonInfo; protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = null; // new ArrayList<Allele>(); protected List<Allele> alleles = null; // new ArrayList<Allele>();
protected Type type = null;
protected boolean isPhased = false; protected boolean isPhased = false;
private boolean filtersWereAppliedToContext; protected boolean filtersWereAppliedToContext;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) { public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
this.alleles = Collections.unmodifiableList(alleles); if ( alleles != null )
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
filtersWereAppliedToContext = filters != null; filtersWereAppliedToContext = filters != null;
this.isPhased = isPhased; this.isPhased = isPhased;
@ -66,6 +69,9 @@ public class Genotype {
} }
public List<Allele> getAlleles(Allele allele) { public List<Allele> getAlleles(Allele allele) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
List<Allele> al = new ArrayList<Allele>(); List<Allele> al = new ArrayList<Allele>();
for ( Allele a : alleles ) for ( Allele a : alleles )
if ( a.equals(allele) ) if ( a.equals(allele) )
@ -75,6 +81,8 @@ public class Genotype {
} }
public Allele getAllele(int i) { public Allele getAllele(int i) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
return alleles.get(i); return alleles.get(i);
} }
@ -89,10 +97,21 @@ public class Genotype {
NO_CALL, NO_CALL,
HOM_REF, HOM_REF,
HET, HET,
HOM_VAR HOM_VAR,
UNAVAILABLE
} }
public Type getType() { public Type getType() {
if ( type == null ) {
type = determineType();
}
return type;
}
protected Type determineType() {
if ( alleles == null )
return Type.UNAVAILABLE;
Allele firstAllele = alleles.get(0); Allele firstAllele = alleles.get(0);
if ( firstAllele.isNoCall() ) { if ( firstAllele.isNoCall() ) {
@ -122,7 +141,8 @@ public class Genotype {
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
*/ */
public boolean isNoCall() { return getType() == Type.NO_CALL; } public boolean isNoCall() { return getType() == Type.NO_CALL; }
public boolean isCalled() { return getType() != Type.NO_CALL; } public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
// //
// Useful methods for getting genotype likelihoods for a genotype object, if present // Useful methods for getting genotype likelihoods for a genotype object, if present
@ -157,8 +177,8 @@ public class Genotype {
} }
public void validate() { public void validate() {
if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles"); if ( alleles == null ) return;
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
int nNoCalls = 0; int nNoCalls = 0;
for ( Allele allele : alleles ) { for ( Allele allele : alleles ) {
@ -175,6 +195,9 @@ public class Genotype {
} }
public String getGenotypeString(boolean ignoreRefState) { public String getGenotypeString(boolean ignoreRefState) {
if ( alleles == null )
return null;
// Notes: // Notes:
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased // 1. Make sure to use the appropriate separator depending on whether the genotype is phased
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)

View File

@ -1206,9 +1206,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
for ( Allele gAllele : g.getAlleles() ) { if ( g.isAvailable() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() ) for ( Allele gAllele : g.getAlleles() ) {
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
}
} }
} }
} }