Allowing VCF records without GTs in vf4.1
This commit is contained in:
parent
a2597e7f00
commit
6007eea3ff
|
|
@ -32,6 +32,7 @@ import org.broad.tribble.index.IndexFactory;
|
||||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
import org.broad.tribble.util.PositionalStream;
|
import org.broad.tribble.util.PositionalStream;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
|
@ -300,10 +301,7 @@ public class StandardVCFWriter implements VCFWriter {
|
||||||
} else {
|
} else {
|
||||||
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
List<String> genotypeAttributeKeys = new ArrayList<String>();
|
||||||
if ( vc.hasGenotypes() ) {
|
if ( vc.hasGenotypes() ) {
|
||||||
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
genotypeAttributeKeys.addAll(calcVCFGenotypeKeys(vc));
|
||||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
|
||||||
genotypeAttributeKeys.add(key);
|
|
||||||
}
|
|
||||||
} else if ( mHeader.hasGenotypingData() ) {
|
} else if ( mHeader.hasGenotypingData() ) {
|
||||||
// this needs to be done in case all samples are no-calls
|
// this needs to be done in case all samples are no-calls
|
||||||
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
genotypeAttributeKeys.add(VCFConstants.GENOTYPE_KEY);
|
||||||
|
|
@ -387,16 +385,22 @@ public class StandardVCFWriter implements VCFWriter {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
|
||||||
|
for ( String key : genotypeFormatKeys ) {
|
||||||
|
|
||||||
|
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||||
|
if ( !g.isAvailable() ) {
|
||||||
|
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
|
||||||
|
}
|
||||||
|
|
||||||
writeAllele(g.getAllele(0), alleleMap);
|
writeAllele(g.getAllele(0), alleleMap);
|
||||||
for (int i = 1; i < g.getPloidy(); i++) {
|
for (int i = 1; i < g.getPloidy(); i++) {
|
||||||
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
|
mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
|
||||||
writeAllele(g.getAllele(i), alleleMap);
|
writeAllele(g.getAllele(i), alleleMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
|
|
||||||
for ( String key : genotypeFormatKeys ) {
|
|
||||||
if ( key.equals(VCFConstants.GENOTYPE_KEY) )
|
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
|
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
|
||||||
|
|
||||||
|
|
@ -488,10 +492,13 @@ public class StandardVCFWriter implements VCFWriter {
|
||||||
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
|
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
|
||||||
Set<String> keys = new HashSet<String>();
|
Set<String> keys = new HashSet<String>();
|
||||||
|
|
||||||
|
boolean sawGoodGT = false;
|
||||||
boolean sawGoodQual = false;
|
boolean sawGoodQual = false;
|
||||||
boolean sawGenotypeFilter = false;
|
boolean sawGenotypeFilter = false;
|
||||||
for ( Genotype g : vc.getGenotypes().values() ) {
|
for ( Genotype g : vc.getGenotypes().values() ) {
|
||||||
keys.addAll(g.getAttributes().keySet());
|
keys.addAll(g.getAttributes().keySet());
|
||||||
|
if ( g.isAvailable() )
|
||||||
|
sawGoodGT = true;
|
||||||
if ( g.hasNegLog10PError() )
|
if ( g.hasNegLog10PError() )
|
||||||
sawGoodQual = true;
|
sawGoodQual = true;
|
||||||
if (g.isFiltered() && g.isCalled())
|
if (g.isFiltered() && g.isCalled())
|
||||||
|
|
@ -504,7 +511,17 @@ public class StandardVCFWriter implements VCFWriter {
|
||||||
if (sawGenotypeFilter)
|
if (sawGenotypeFilter)
|
||||||
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
|
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
|
||||||
|
|
||||||
return ParsingUtils.sortList(new ArrayList<String>(keys));
|
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
|
||||||
|
|
||||||
|
// make sure the GT is first
|
||||||
|
if ( sawGoodGT ) {
|
||||||
|
List<String> newList = new ArrayList<String>(sortedList.size()+1);
|
||||||
|
newList.add(VCFConstants.GENOTYPE_KEY);
|
||||||
|
newList.addAll(sortedList);
|
||||||
|
sortedList = newList;
|
||||||
|
}
|
||||||
|
|
||||||
|
return sortedList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -141,8 +141,6 @@ public class VCF3Codec extends AbstractVCFCodec {
|
||||||
boolean missing = i >= GTValueSplitSize;
|
boolean missing = i >= GTValueSplitSize;
|
||||||
|
|
||||||
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||||
if (i != 0)
|
|
||||||
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
|
|
||||||
genotypeAlleleLocation = i;
|
genotypeAlleleLocation = i;
|
||||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||||
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
|
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
|
||||||
|
|
@ -156,12 +154,13 @@ public class VCF3Codec extends AbstractVCFCodec {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// check to make sure we found a gentoype field
|
// check to make sure we found a genotype field
|
||||||
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
|
if ( genotypeAlleleLocation < 0 )
|
||||||
|
generateException("Unable to find the GT field for the record; the GT field is required");
|
||||||
|
if ( genotypeAlleleLocation > 0 )
|
||||||
|
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
|
||||||
|
|
||||||
// todo -- assuming allele list length in the single digits is bad. Fix me.
|
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
|
||||||
// Check for > 1 for haploid genotypes
|
|
||||||
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
|
||||||
|
|
||||||
// add it to the list
|
// add it to the list
|
||||||
try {
|
try {
|
||||||
|
|
|
||||||
|
|
@ -145,8 +145,6 @@ public class VCFCodec extends AbstractVCFCodec {
|
||||||
|
|
||||||
// todo -- all of these on the fly parsing of the missing value should be static constants
|
// todo -- all of these on the fly parsing of the missing value should be static constants
|
||||||
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||||
if (i != 0)
|
|
||||||
generateException("Saw GT at position " + i + ", but it must be at the first position for genotypes");
|
|
||||||
genotypeAlleleLocation = i;
|
genotypeAlleleLocation = i;
|
||||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||||
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
|
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
|
||||||
|
|
@ -160,18 +158,20 @@ public class VCFCodec extends AbstractVCFCodec {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// check to make sure we found a gentoype field
|
// check to make sure we found a genotype field if we are a VCF4.0 file
|
||||||
// TODO -- This is no longer required in v4.1
|
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
|
||||||
if (genotypeAlleleLocation < 0) generateException("Unable to find required field GT for the record; we don't yet support a missing GT field");
|
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
|
||||||
|
if ( genotypeAlleleLocation > 0 )
|
||||||
|
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
|
||||||
|
|
||||||
// todo -- assuming allele list length in the single digits is bad. Fix me.
|
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
|
||||||
// Check for > 1 for haploid genotypes
|
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
|
||||||
boolean phased = GTValueArray[genotypeAlleleLocation].length() > 1 && GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
|
||||||
|
|
||||||
// add it to the list
|
// add it to the list
|
||||||
try {
|
try {
|
||||||
genotypes.put(sampleName, new Genotype(sampleName,
|
genotypes.put(sampleName,
|
||||||
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
|
new Genotype(sampleName,
|
||||||
|
GTalleles,
|
||||||
GTQual,
|
GTQual,
|
||||||
genotypeFilters,
|
genotypeFilters,
|
||||||
gtAttributes,
|
gtAttributes,
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext;
|
||||||
|
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -19,11 +20,13 @@ public class Genotype {
|
||||||
protected InferredGeneticContext commonInfo;
|
protected InferredGeneticContext commonInfo;
|
||||||
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
|
||||||
protected List<Allele> alleles = null; // new ArrayList<Allele>();
|
protected List<Allele> alleles = null; // new ArrayList<Allele>();
|
||||||
|
protected Type type = null;
|
||||||
|
|
||||||
protected boolean isPhased = false;
|
protected boolean isPhased = false;
|
||||||
private boolean filtersWereAppliedToContext;
|
protected boolean filtersWereAppliedToContext;
|
||||||
|
|
||||||
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
|
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
|
||||||
|
if ( alleles != null )
|
||||||
this.alleles = Collections.unmodifiableList(alleles);
|
this.alleles = Collections.unmodifiableList(alleles);
|
||||||
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
|
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
|
||||||
filtersWereAppliedToContext = filters != null;
|
filtersWereAppliedToContext = filters != null;
|
||||||
|
|
@ -66,6 +69,9 @@ public class Genotype {
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Allele> getAlleles(Allele allele) {
|
public List<Allele> getAlleles(Allele allele) {
|
||||||
|
if ( getType() == Type.UNAVAILABLE )
|
||||||
|
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
|
||||||
|
|
||||||
List<Allele> al = new ArrayList<Allele>();
|
List<Allele> al = new ArrayList<Allele>();
|
||||||
for ( Allele a : alleles )
|
for ( Allele a : alleles )
|
||||||
if ( a.equals(allele) )
|
if ( a.equals(allele) )
|
||||||
|
|
@ -75,6 +81,8 @@ public class Genotype {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Allele getAllele(int i) {
|
public Allele getAllele(int i) {
|
||||||
|
if ( getType() == Type.UNAVAILABLE )
|
||||||
|
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
|
||||||
return alleles.get(i);
|
return alleles.get(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -89,10 +97,21 @@ public class Genotype {
|
||||||
NO_CALL,
|
NO_CALL,
|
||||||
HOM_REF,
|
HOM_REF,
|
||||||
HET,
|
HET,
|
||||||
HOM_VAR
|
HOM_VAR,
|
||||||
|
UNAVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
public Type getType() {
|
public Type getType() {
|
||||||
|
if ( type == null ) {
|
||||||
|
type = determineType();
|
||||||
|
}
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Type determineType() {
|
||||||
|
if ( alleles == null )
|
||||||
|
return Type.UNAVAILABLE;
|
||||||
|
|
||||||
Allele firstAllele = alleles.get(0);
|
Allele firstAllele = alleles.get(0);
|
||||||
|
|
||||||
if ( firstAllele.isNoCall() ) {
|
if ( firstAllele.isNoCall() ) {
|
||||||
|
|
@ -122,7 +141,8 @@ public class Genotype {
|
||||||
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
|
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
|
||||||
*/
|
*/
|
||||||
public boolean isNoCall() { return getType() == Type.NO_CALL; }
|
public boolean isNoCall() { return getType() == Type.NO_CALL; }
|
||||||
public boolean isCalled() { return getType() != Type.NO_CALL; }
|
public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
|
||||||
|
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
|
||||||
|
|
||||||
//
|
//
|
||||||
// Useful methods for getting genotype likelihoods for a genotype object, if present
|
// Useful methods for getting genotype likelihoods for a genotype object, if present
|
||||||
|
|
@ -157,8 +177,8 @@ public class Genotype {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void validate() {
|
public void validate() {
|
||||||
if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles");
|
if ( alleles == null ) return;
|
||||||
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
|
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
|
||||||
|
|
||||||
int nNoCalls = 0;
|
int nNoCalls = 0;
|
||||||
for ( Allele allele : alleles ) {
|
for ( Allele allele : alleles ) {
|
||||||
|
|
@ -175,6 +195,9 @@ public class Genotype {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getGenotypeString(boolean ignoreRefState) {
|
public String getGenotypeString(boolean ignoreRefState) {
|
||||||
|
if ( alleles == null )
|
||||||
|
return null;
|
||||||
|
|
||||||
// Notes:
|
// Notes:
|
||||||
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
|
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
|
||||||
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)
|
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)
|
||||||
|
|
|
||||||
|
|
@ -1206,12 +1206,14 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
||||||
|
|
||||||
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
|
if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName());
|
||||||
|
|
||||||
|
if ( g.isAvailable() ) {
|
||||||
for ( Allele gAllele : g.getAlleles() ) {
|
for ( Allele gAllele : g.getAlleles() ) {
|
||||||
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
|
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
|
||||||
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
|
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue