Near final BCF2 implementation

-- Trivial import changes in some walkers
-- SelectVariants has a new hidden mode to fully decode a VCF file
-- DepthPerAlleleBySample (AD) changed to have count type A instead of UNBOUNDED, which is actually the right type
-- GenotypeLikelihoods now implements List<Double> for convenience.  The PL duality here is going to be removed in a subsequent commit
-- Bug fixes in BCF2Writer.  Proper handling of padding.  Bug fix in computing nFields for a field
-- padAllele function in VariantContextUtils
-- Much better tests for VariantContextTestProvider, including loading parts of dbSNP 135 and the Phase II 1000G call set with genotypes to test encoding / decoding of fields.
This commit is contained in:
Mark DePristo 2012-05-16 20:21:03 -04:00
parent dfee17a672
commit aaf11f00e3
11 changed files with 215 additions and 100 deletions

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator; package org.broadinstitute.sting.gatk.walkers.annotator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -15,10 +16,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/** /**
@ -79,9 +77,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
Map<String, Object> map = new HashMap<String, Object>(); return toADAnnotation(counts);
map.put(getKeyNames().get(0), counts);
return map;
} }
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) { private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
@ -132,11 +128,11 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) ); counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) );
Map<String, Object> map = new HashMap<String, Object>(); return toADAnnotation(counts);
map.put(getKeyNames().get(0), counts); }
//map.put(getKeyNames().get(0), counts); private final Map<String, Object> toADAnnotation(final Integer[] counts) {
return map; return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts));
} }
private String getAlleleRepresentation(Allele allele) { private String getAlleleRepresentation(Allele allele) {
@ -151,5 +147,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
// public String getIndelBases() // public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("AD"); } public List<String> getKeyNames() { return Arrays.asList("AD"); }
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFFormatHeaderLine(
getKeyNames().get(0),
VCFHeaderLineCount.A,
VCFHeaderLineType.Integer,
"Allelic depths for the ref and alt alleles in the order listed"));
}
} }

View File

@ -310,6 +310,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false) @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
private String outMVFile = null; private String outMVFile = null;
@Hidden
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
private boolean fullyDecode = false;
/* Private class used to store the intermediate variants in the integer random selection process */ /* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure { private class RandomVariantStructure {
private VariantContext vc; private VariantContext vc;
@ -357,6 +361,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private Random randomGenotypes = new Random(); private Random randomGenotypes = new Random();
private Set<String> IDsToKeep = null; private Set<String> IDsToKeep = null;
private Map<String, VCFHeader> vcfRods;
/** /**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
@ -365,7 +370,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
// Get list of samples to include in the output // Get list of samples to include in the output
List<String> rodNames = Arrays.asList(variantCollection.variants.getName()); List<String> rodNames = Arrays.asList(variantCollection.variants.getName());
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
@ -486,6 +491,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
} }
for (VariantContext vc : vcs) { for (VariantContext vc : vcs) {
if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
continue; continue;
@ -724,6 +730,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
} }
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) { private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) {
if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data
if (KEEP_ORIGINAL_CHR_COUNTS) { if (KEEP_ORIGINAL_CHR_COUNTS) {
if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY));

View File

@ -323,7 +323,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("REF", new Getter() { getters.put("REF", new Getter() {
public String get(VariantContext vc) { public String get(VariantContext vc) {
StringBuilder x = new StringBuilder(); StringBuilder x = new StringBuilder();
x.append(vc.getAlleleWithRefPadding(vc.getReference())); x.append(vc.getAlleleStringWithRefPadding(vc.getReference()));
return x.toString(); return x.toString();
} }
}); });
@ -335,7 +335,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
for ( int i = 0; i < n; i++ ) { for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(","); if ( i != 0 ) x.append(",");
x.append(vc.getAlleleWithRefPadding(vc.getAlternateAllele(i))); x.append(vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(i)));
} }
return x.toString(); return x.toString();
} }
@ -377,11 +377,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
private static Object splitAltAlleles(VariantContext vc) { private static Object splitAltAlleles(VariantContext vc) {
final int numAltAlleles = vc.getAlternateAlleles().size(); final int numAltAlleles = vc.getAlternateAlleles().size();
if ( numAltAlleles == 1 ) if ( numAltAlleles == 1 )
return vc.getAlleleWithRefPadding(vc.getAlternateAllele(0)); return vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(0));
final List<String> alleles = new ArrayList<String>(numAltAlleles); final List<String> alleles = new ArrayList<String>(numAltAlleles);
for ( Allele allele : vc.getAlternateAlleles() ) for ( Allele allele : vc.getAlternateAlleles() )
alleles.add(vc.getAlleleWithRefPadding(allele)); alleles.add(vc.getAlleleStringWithRefPadding(allele));
return alleles; return alleles;
} }
} }

View File

@ -233,7 +233,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
} }
public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) { public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size()); ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1); AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
return clippedAlleles; return clippedAlleles;

View File

@ -24,15 +24,16 @@
package org.broadinstitute.sting.utils.variantcontext; package org.broadinstitute.sting.utils.variantcontext;
import org.apache.commons.lang.ArrayUtils;
import org.broad.tribble.TribbleException; import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.EnumMap; import java.util.*;
public class GenotypeLikelihoods { public class GenotypeLikelihoods implements List<Double> {
public static final boolean CAP_PLS = false; public static final boolean CAP_PLS = false;
public static final int PL_CAP = 255; public static final int PL_CAP = 255;
@ -205,6 +206,41 @@ public class GenotypeLikelihoods {
return s.toString(); return s.toString();
} }
// -------------------------------------------------------------------------------------
//
// List interface functions
//
// -------------------------------------------------------------------------------------
private final void notImplemented() {
throw new ReviewedStingException("BUG: code not implemented");
}
@Override public int size() { return this.log10Likelihoods.length; }
@Override public Double get(final int i) { return log10Likelihoods[i];}
@Override public Double set(final int i, final Double aDouble) { return log10Likelihoods[i] = aDouble; }
@Override public boolean isEmpty() { return false; }
@Override public Iterator<Double> iterator() { return Arrays.asList(ArrayUtils.toObject(log10Likelihoods)).iterator(); }
@Override public Object[] toArray() { return ArrayUtils.toObject(log10Likelihoods); }
// none of these are implemented
@Override public boolean contains(final Object o) { notImplemented(); return false; }
@Override public <T> T[] toArray(final T[] ts) { notImplemented(); return null; }
@Override public boolean add(final Double aDouble) { notImplemented(); return false; }
@Override public boolean remove(final Object o) {notImplemented(); return false; }
@Override public boolean containsAll(final Collection<?> objects) { notImplemented(); return false; }
@Override public boolean addAll(final Collection<? extends Double> doubles) { notImplemented(); return false; }
@Override public boolean addAll(final int i, final Collection<? extends Double> doubles) { notImplemented(); return false; }
@Override public boolean removeAll(final Collection<?> objects) { notImplemented(); return false; }
@Override public boolean retainAll(final Collection<?> objects) { notImplemented(); return false; }
@Override public void clear() { notImplemented(); }
@Override public void add(final int i, final Double aDouble) { notImplemented(); }
@Override public Double remove(final int i) { notImplemented(); return null; }
@Override public int indexOf(final Object o) { notImplemented(); return -1; }
@Override public int lastIndexOf(final Object o) { notImplemented(); return 0; }
@Override public ListIterator<Double> listIterator() { notImplemented(); return null; }
@Override public ListIterator<Double> listIterator(final int i) { notImplemented(); return null; }
@Override public List<Double> subList(final int i, final int i1) { notImplemented(); return null; }
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
// //
@ -280,7 +316,6 @@ public class GenotypeLikelihoods {
* @param ploidy Ploidy, or number of chromosomes in set * @param ploidy Ploidy, or number of chromosomes in set
* @return Number of likelihood elements we need to hold. * @return Number of likelihood elements we need to hold.
*/ */
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
// fast, closed form solution for diploid samples (most common use case) // fast, closed form solution for diploid samples (most common use case)

View File

@ -519,13 +519,10 @@ public class VariantContext implements Feature { // to enable tribble integratio
return REFERENCE_BASE_FOR_INDEL; return REFERENCE_BASE_FOR_INDEL;
} }
public String getAlleleWithRefPadding(final Allele allele) { public String getAlleleStringWithRefPadding(final Allele allele) {
if ( hasReferenceBaseForIndel() && isIndel() ) { if ( VariantContextUtils.needsPadding(this) )
StringBuilder sb = new StringBuilder(); return VariantContextUtils.padAllele(this, allele);
sb.append((char)getReferenceBaseForIndel().byteValue()); else
sb.append(allele.getDisplayString());
return sb.toString();
} else
return allele.getDisplayString(); return allele.getDisplayString();
} }
@ -1288,6 +1285,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
final String field = attr.getKey(); final String field = attr.getKey();
final VCFCompoundHeaderLine format = getMetaDataForField(header, field); final VCFCompoundHeaderLine format = getMetaDataForField(header, field);
final Object decoded = decodeValue(field, attr.getValue(), format); final Object decoded = decodeValue(field, attr.getValue(), format);
if ( decoded != null ) if ( decoded != null )
newAttributes.put(field, decoded); newAttributes.put(field, decoded);
} }
@ -1297,6 +1295,9 @@ public class VariantContext implements Feature { // to enable tribble integratio
private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) { private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) {
if ( value instanceof String ) { if ( value instanceof String ) {
if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) )
return value.equals(".") ? null : new GenotypeLikelihoods((String)value);
final String string = (String)value; final String string = (String)value;
if ( string.indexOf(",") != -1 ) { if ( string.indexOf(",") != -1 ) {
final String[] splits = string.split(","); final String[] splits = string.split(",");
@ -1307,6 +1308,12 @@ public class VariantContext implements Feature { // to enable tribble integratio
} else { } else {
return decodeOne(field, string, format); return decodeOne(field, string, format);
} }
} else if ( value instanceof List && (((List) value).get(0)) instanceof String ) {
final List<String> asList = (List<String>)value;
final List<Object> values = new ArrayList<Object>(asList.size());
for ( final String s : asList )
values.add(decodeOne(field, s, format));
return values;
} else { } else {
return value; return value;
} }
@ -1321,7 +1328,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
case Flag: return Boolean.valueOf(string); case Flag: return Boolean.valueOf(string);
case String: return string; case String: return string;
case Integer: return Integer.valueOf(string); case Integer: return Integer.valueOf(string);
case Float: return Float.valueOf(string); case Float: return Double.valueOf(string);
default: throw new ReviewedStingException("Unexpected type for field" + field); default: throw new ReviewedStingException("Unexpected type for field" + field);
} }
} }

View File

@ -197,6 +197,16 @@ public class VariantContextUtils {
return false; return false;
} }
public static String padAllele(final VariantContext vc, final Allele allele) {
assert needsPadding(vc);
StringBuilder sb = new StringBuilder();
sb.append((char)vc.getReferenceBaseForIndel().byteValue());
sb.append(allele.getDisplayString());
return sb.toString();
}
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) {
final boolean padVC = needsPadding(inputVC); final boolean padVC = needsPadding(inputVC);

View File

@ -32,10 +32,7 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
@ -177,8 +174,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
} }
private void buildAlleles( VariantContext vc ) throws IOException { private void buildAlleles( VariantContext vc ) throws IOException {
final boolean needsPadding = VariantContextUtils.needsPadding(vc);
for ( final Allele allele : vc.getAlleles() ) { for ( final Allele allele : vc.getAlleles() ) {
final String s = vc.getAlleleWithRefPadding(allele); final String s = needsPadding ? VariantContextUtils.padAllele(vc,allele) : allele.getDisplayString();
encoder.encodeTyped(s, BCF2Type.CHAR); encoder.encodeTyped(s, BCF2Type.CHAR);
} }
} }
@ -211,6 +209,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
addGQ(vc); addGQ(vc);
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
addGenotypeFilters(vc); addGenotypeFilters(vc);
// } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
// addPLs(vc);
} else { } else {
addGenericGenotypeField(vc, field); addGenericGenotypeField(vc, field);
} }
@ -221,7 +221,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) { private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field); final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
int nFields = metaData.getCount(vc.getAlternateAlleles().size()); int nFields = metaData.getCount(vc.getNAlleles());
if ( nFields == -1 ) { // unbounded, need to look at values if ( nFields == -1 ) { // unbounded, need to look at values
return computeMaxSizeOfGenotypeFieldFromValues(field, vc); return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
} else { } else {
@ -264,6 +264,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type); encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type);
} else { } else {
final Object val = g.getAttribute(field); final Object val = g.getAttribute(field);
if ( (val instanceof List) && (((List) val).size() != numInFormatField ))
throw new ReviewedStingException("BUG: header for " + field +
" has inconsistent number of values " + numInFormatField +
" compared to values in VariantContext " + ((List) val).size());
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val; final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
encoder.encodeRawValues(vals, encoding.BCF2Type); encoder.encodeRawValues(vals, encoding.BCF2Type);
} }
@ -297,7 +301,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
case Flag: case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1)); return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String: case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String)value); final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s); return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
case Integer: // note integer calculation is a bit complex because of the need to determine sizes case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l; List<Integer> l;
@ -346,6 +350,19 @@ class BCF2Writer extends IndexingVariantContextWriter {
} }
} }
// private final void addPLs(final VariantContext vc) throws IOException {
// startGenotypeField(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 1, BCF2Type.INT16);
// for ( final Genotype g : vc.getGenotypes() ) {
// if ( g.hasLog10PError() ) {
// final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
// if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc);
// encoder.encodeRawValue(GQ, BCF2Type.INT8);
// } else {
// encoder.encodeRawMissingValues(1, BCF2Type.INT8);
// }
// }
// }
private final void addGenotypes(final VariantContext vc) throws IOException { private final void addGenotypes(final VariantContext vc) throws IOException {
if ( vc.getNAlleles() > 127 ) if ( vc.getNAlleles() > 127 )
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
@ -390,7 +407,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
// iterate over strings until we find one that needs 16 bits, and break // iterate over strings until we find one that needs 16 bits, and break
for ( final String string : strings ) { for ( final String string : strings ) {
final int offset = stringDictionaryMap.get(string); final Integer got = stringDictionaryMap.get(string);
if ( got == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
final int offset = got;
offsets.add(offset); offsets.add(offset);
final BCF2Type type1 = encoder.determineIntegerType(offset); final BCF2Type type1 = encoder.determineIntegerType(offset);
switch ( type1 ) { switch ( type1 ) {

View File

@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.EnumMap; import java.util.EnumMap;

View File

@ -27,7 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream; import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.testng.Assert; import org.testng.Assert;
@ -44,10 +46,17 @@ import java.util.*;
* @since Date created * @since Date created
*/ */
public class VariantContextTestProvider { public class VariantContextTestProvider {
final private static boolean ADVANCED_TESTS = false; final private static boolean ENABLE_VARARRAY_TESTS = false;
final static VCFHeader header; final private static boolean ENABLE_PLOIDY_TESTS = false;
final private static boolean ENABLE_PL_TESTS = true;
private static VCFHeader syntheticHeader;
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>(); final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
final static VariantContext ROOT; private static VariantContext ROOT;
private final static List<File> testSourceVCFs = Arrays.asList(
new File(BaseTest.testDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"),
new File(BaseTest.testDir + "dbsnp_135.b37.1000.vcf")
);
public abstract static class VariantContextIOTest { public abstract static class VariantContextIOTest {
public String toString() { public String toString() {
@ -67,17 +76,19 @@ public class VariantContextTestProvider {
} }
public static class VariantContextTestData { public static class VariantContextTestData {
public final VCFHeader header;
public List<VariantContext> vcs; public List<VariantContext> vcs;
public VariantContextTestData(final VariantContextBuilder builder) { public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) {
this(Collections.singletonList(builder.make())); this(header, Collections.singletonList(builder.make()));
} }
public VariantContextTestData(final VariantContext vc) { public VariantContextTestData(final VCFHeader header, final List<VariantContext> vcs) {
this(Collections.singletonList(vc)); final Set<String> samples = new HashSet<String>();
} for ( final VariantContext vc : vcs )
if ( vc.hasGenotypes() )
public VariantContextTestData(final List<VariantContext> vcs) { samples.addAll(vc.getSampleNames());
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaData(), samples);
this.vcs = vcs; this.vcs = vcs;
} }
@ -88,9 +99,11 @@ public class VariantContextTestProvider {
public String toString() { public String toString() {
StringBuilder b = new StringBuilder(); StringBuilder b = new StringBuilder();
b.append("VariantContextTestData: ["); b.append("VariantContextTestData: [");
for ( VariantContext vc : vcs ) { final VariantContext vc = vcs.get(0);
b.append(vc.toString()).append(" ----- "); final VariantContextBuilder builder = new VariantContextBuilder(vc);
} builder.noGenotypes();
b.append(builder.make().toString()).append(" nGenotypes = ").append(vc.getNSamples());
if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records");
b.append("]"); b.append("]");
return b.toString(); return b.toString();
} }
@ -101,11 +114,55 @@ public class VariantContextTestProvider {
} }
private final static void add(VariantContextBuilder builder) { private final static void add(VariantContextBuilder builder) {
TEST_DATAs.add(new VariantContextTestData(builder)); TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder));
} }
static { public static void initializeTests() throws IOException {
createSyntheticHeader();
makeSyntheticTests();
makeEmpiricalTests();
}
private static void makeEmpiricalTests() throws IOException {
for ( final File file : testSourceVCFs ) {
VCFCodec codec = new VCFCodec();
Pair<VCFHeader, List<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(x.getSecond().size());
for ( final VariantContext raw : x.getSecond() )
fullyDecoded.add(raw.fullyDecode(x.getFirst()));
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond()));
}
}
private static void createSyntheticHeader() {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>(); Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x"));
metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x"));
metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x"));
syntheticHeader = new VCFHeader(metaData);
}
private static void makeSyntheticTests() {
VariantContextBuilder rootBuilder = new VariantContextBuilder(); VariantContextBuilder rootBuilder = new VariantContextBuilder();
rootBuilder.source("test"); rootBuilder.source("test");
rootBuilder.loc("1", 10, 10); rootBuilder.loc("1", 10, 10);
@ -126,8 +183,6 @@ public class VariantContextTestProvider {
add(builder().passFilters()); add(builder().passFilters());
add(builder().filters("FILTER1")); add(builder().filters("FILTER1"));
add(builder().filters("FILTER1", "FILTER2")); add(builder().filters("FILTER1", "FILTER2"));
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
add(builder().log10PError(VariantContext.NO_LOG10_PERROR)); add(builder().log10PError(VariantContext.NO_LOG10_PERROR));
add(builder().log10PError(-1)); add(builder().log10PError(-1));
@ -147,12 +202,6 @@ public class VariantContextTestProvider {
add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000)));
add(builder().attribute("INT3", null)); add(builder().attribute("INT3", null));
add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20))); add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)));
metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x"));
metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
add(builder().attribute("FLOAT1", 1.0)); add(builder().attribute("FLOAT1", 1.0));
add(builder().attribute("FLOAT1", 100.0)); add(builder().attribute("FLOAT1", 100.0));
@ -163,32 +212,17 @@ public class VariantContextTestProvider {
add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0))); add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0)));
add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0))); add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0)));
add(builder().attribute("FLOAT3", null)); add(builder().attribute("FLOAT3", null));
metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x"));
metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x"));
add(builder().attribute("FLAG", true)); add(builder().attribute("FLAG", true));
add(builder().attribute("FLAG", false)); add(builder().attribute("FLAG", false));
metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x"));
add(builder().attribute("STRING1", "s1")); add(builder().attribute("STRING1", "s1"));
add(builder().attribute("STRING1", null)); add(builder().attribute("STRING1", null));
add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
add(builder().attribute("STRING3", null)); add(builder().attribute("STRING3", null));
add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
addGenotypesToTestData(); addGenotypesToTestData();
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
header = new VCFHeader(metaData);
} }
private static void addGenotypesToTestData() { private static void addGenotypesToTestData() {
@ -250,7 +284,7 @@ public class VariantContextTestProvider {
addGenotypeTests(site, homRef, het, homVar); addGenotypeTests(site, homRef, het, homVar);
// ploidy // ploidy
if ( ADVANCED_TESTS ) { if ( ENABLE_PLOIDY_TESTS ) {
addGenotypeTests(site, addGenotypeTests(site,
new Genotype("dip", Arrays.asList(ref, alt1)), new Genotype("dip", Arrays.asList(ref, alt1)),
new Genotype("hap", Arrays.asList(ref))); new Genotype("hap", Arrays.asList(ref)));
@ -261,7 +295,7 @@ public class VariantContextTestProvider {
} }
} }
if ( ADVANCED_TESTS ) { if ( ENABLE_PL_TESTS ) {
// testing PLs // testing PLs
addGenotypeTests(site, addGenotypeTests(site,
new Genotype("g1", Arrays.asList(ref, ref), -1, new double[]{0, -1, -2}), new Genotype("g1", Arrays.asList(ref, ref), -1, new double[]{0, -1, -2}),
@ -294,7 +328,7 @@ public class VariantContextTestProvider {
attr("g1", ref, "INT3", 1, 2, 3), attr("g1", ref, "INT3", 1, 2, 3),
attr("g2", ref, "INT3")); attr("g2", ref, "INT3"));
if ( ADVANCED_TESTS ) { if (ENABLE_VARARRAY_TESTS) {
addGenotypeTests(site, addGenotypeTests(site,
attr("g1", ref, "INT.VAR", 1, 2, 3), attr("g1", ref, "INT.VAR", 1, 2, 3),
attr("g2", ref, "INT.VAR", 4, 5), attr("g2", ref, "INT.VAR", 4, 5),
@ -331,13 +365,6 @@ public class VariantContextTestProvider {
} }
} }
private static VCFHeader getHeader(final List<VariantContext> vcs) {
final Set<String> samples = new HashSet<String>();
for ( final VariantContext vc : vcs )
samples.addAll(vc.getSampleNames());
return new VCFHeader(header.getMetaData(), samples);
}
public static List<VariantContextTestData> generateSiteTests() { public static List<VariantContextTestData> generateSiteTests() {
return TEST_DATAs; return TEST_DATAs;
} }
@ -351,31 +378,37 @@ public class VariantContextTestProvider {
// write // write
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY); final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final VariantContextWriter writer = tester.makeWriter(tmpFile, options); final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
writer.writeHeader(VariantContextTestProvider.getHeader(data.vcs)); writer.writeHeader(data.header);
final List<VariantContext> expected = data.vcs; final List<VariantContext> expected = data.vcs;
for ( VariantContext vc : expected ) for ( VariantContext vc : expected )
writer.add(vc); writer.add(vc);
writer.close(); writer.close();
// read in the features final List<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
FeatureCodec<VariantContext> codec = tester.makeCodec();
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(tmpFile));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
// TODO -- test header quality
pbs = new PositionalBufferedStream(new FileInputStream(tmpFile));
pbs.skip(header.getHeaderEnd());
final List<VariantContext> actual = new ArrayList<VariantContext>(expected.size());
while ( ! pbs.isDone() ) { actual.add(codec.decode(pbs)); };
Assert.assertEquals(actual.size(), expected.size()); Assert.assertEquals(actual.size(), expected.size());
for ( int i = 0; i < expected.size(); i++ ) for ( int i = 0; i < expected.size(); i++ )
VariantContextTestProvider.assertEquals(actual.get(i), expected.get(i)); VariantContextTestProvider.assertEquals(actual.get(i), expected.get(i));
} }
private final static Pair<VCFHeader, List<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
// read in the features
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
pbs = new PositionalBufferedStream(new FileInputStream(source));
pbs.skip(header.getHeaderEnd());
final List<VariantContext> read = new ArrayList<VariantContext>();
while ( ! pbs.isDone() ) {
final VariantContext vc = codec.decode(pbs);
if ( vc != null ) read.add(vc);
};
return new Pair<VCFHeader, List<VariantContext>>((VCFHeader)header.getHeaderValue(), read);
}
public static void assertEquals( final VariantContext actual, final VariantContext expected ) { public static void assertEquals( final VariantContext actual, final VariantContext expected ) {
Assert.assertNotNull(actual); Assert.assertNotNull(actual);
Assert.assertEquals(actual.getChr(), expected.getChr()); Assert.assertEquals(actual.getChr(), expected.getChr());

View File

@ -57,6 +57,7 @@ public class VariantContextWritersUnitTest extends BaseTest {
public void before() throws IOException { public void before() throws IOException {
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
dictionary = seq.getSequenceDictionary(); dictionary = seq.getSequenceDictionary();
VariantContextTestProvider.initializeTests();
} }
@DataProvider(name = "VariantContextTest_SingleContexts") @DataProvider(name = "VariantContextTest_SingleContexts")