From aaf11f00e36767d91255d7fa2ef4cf4a1951296f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 May 2012 20:21:03 -0400 Subject: [PATCH] Near final BCF2 implementation -- Trivial import changes in some walkers -- SelectVariants has a new hidden mode to fully decode a VCF file -- DepthPerAlleleBySample (AD) changed to have not UNBOUNDED by A type, which is actually the right type -- GenotypeLikelihoods now implements List for convenience. The PL duality here is going to be removed in a subsequent commit -- BugFixes in BCF2Writer. Proper handling of padding. Bugfix for nFields for a field -- padAllele function in VariantContextUtils -- Much better tests for VariantContextTestProvider, including loading parts of dbSNP 135 and the Phase II 1000G call set with genotypes to test encoding / decoding of fields. --- .../annotator/DepthPerAlleleBySample.java | 27 +-- .../walkers/variantutils/SelectVariants.java | 10 +- .../walkers/variantutils/VariantsToTable.java | 8 +- .../sting/utils/codecs/bcf2/BCF2Codec.java | 2 +- .../variantcontext/GenotypeLikelihoods.java | 41 ++++- .../utils/variantcontext/VariantContext.java | 23 ++- .../variantcontext/VariantContextUtils.java | 10 ++ .../variantcontext/writer/BCF2Writer.java | 35 +++- .../GenotypeLikelihoodsUnitTest.java | 1 - .../VariantContextTestProvider.java | 157 +++++++++++------- .../writer/VariantContextWritersUnitTest.java | 1 + 11 files changed, 215 insertions(+), 100 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index acb1e378a..6b6648a98 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -15,10 +16,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -79,9 +77,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); - Map map = new HashMap(); - map.put(getKeyNames().get(0), counts); - return map; + return toADAnnotation(counts); } private Map annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) { @@ -132,11 +128,11 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) ); - Map map = new HashMap(); - map.put(getKeyNames().get(0), counts); + return toADAnnotation(counts); + } - //map.put(getKeyNames().get(0), counts); - return map; + private final Map toADAnnotation(final Integer[] counts) { + return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts)); } private String getAlleleRepresentation(Allele allele) { @@ -151,5 +147,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa // public String getIndelBases() public List getKeyNames() { return Arrays.asList("AD"); } - public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } + public List getDescriptions() { + return Arrays.asList( + new VCFFormatHeaderLine( + getKeyNames().get(0), + VCFHeaderLineCount.A, + VCFHeaderLineType.Integer, + "Allelic depths for the ref and alt alleles in the order listed")); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 7a80d1e36..e4c665482 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -310,6 +310,10 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false) private String outMVFile = null; + @Hidden + @Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false) + private boolean fullyDecode = false; + /* Private class used to store the intermediate variants in the integer random selection process */ private class RandomVariantStructure { private VariantContext vc; @@ -357,6 +361,7 @@ public class SelectVariants extends RodWalker implements TreeR private Random randomGenotypes = new Random(); private Set IDsToKeep = null; + private Map vcfRods; /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -365,7 +370,7 @@ public class SelectVariants extends RodWalker implements TreeR // Get list of samples to include in the output List rodNames = Arrays.asList(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); @@ -486,6 +491,7 @@ public class SelectVariants extends RodWalker implements TreeR } for (VariantContext vc : vcs) { + if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource())); if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) continue; @@ -724,6 +730,8 @@ public class SelectVariants extends RodWalker implements TreeR } private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) { + if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data + if (KEEP_ORIGINAL_CHR_COUNTS) { if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 0c7ed08cf..dce8378f6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -323,7 +323,7 @@ public class VariantsToTable extends RodWalker { getters.put("REF", new Getter() { public String get(VariantContext vc) { StringBuilder x = new StringBuilder(); - x.append(vc.getAlleleWithRefPadding(vc.getReference())); + x.append(vc.getAlleleStringWithRefPadding(vc.getReference())); return x.toString(); } }); @@ -335,7 +335,7 @@ public class VariantsToTable extends RodWalker { for ( int i = 0; i < n; i++ ) { if ( i != 0 ) x.append(","); - x.append(vc.getAlleleWithRefPadding(vc.getAlternateAllele(i))); + x.append(vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(i))); } return x.toString(); } @@ -377,11 +377,11 @@ public class VariantsToTable extends RodWalker { private static Object splitAltAlleles(VariantContext vc) { final int numAltAlleles = vc.getAlternateAlleles().size(); if ( numAltAlleles == 1 ) - return vc.getAlleleWithRefPadding(vc.getAlternateAllele(0)); + return vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(0)); final List alleles = new ArrayList(numAltAlleles); for ( Allele allele : vc.getAlternateAlleles() ) - alleles.add(vc.getAlleleWithRefPadding(allele)); + alleles.add(vc.getAlleleStringWithRefPadding(allele)); return alleles; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index c4826a833..0701a9fdd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -233,7 +233,7 @@ public class BCF2Codec implements FeatureCodec { } public static ArrayList clipAllelesIfNecessary(int position, String ref, ArrayList unclippedAlleles) { - if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { + if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { ArrayList clippedAlleles = new ArrayList(unclippedAlleles.size()); AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1); return clippedAlleles; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 3bebac4fa..78baf5759 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -24,15 +24,16 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.commons.lang.ArrayUtils; import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import java.util.EnumMap; +import java.util.*; -public class GenotypeLikelihoods { +public class GenotypeLikelihoods implements List { public static final boolean CAP_PLS = false; public static final int PL_CAP = 255; @@ -205,6 +206,41 @@ public class GenotypeLikelihoods { return s.toString(); } + // ------------------------------------------------------------------------------------- + // + // List interface functions + // + // ------------------------------------------------------------------------------------- + + private final void notImplemented() { + throw new ReviewedStingException("BUG: code not implemented"); + } + + @Override public int size() { return this.log10Likelihoods.length; } + @Override public Double get(final int i) { return log10Likelihoods[i];} + @Override public Double set(final int i, final Double aDouble) { return log10Likelihoods[i] = aDouble; } + @Override public boolean isEmpty() { return false; } + @Override public Iterator iterator() { return Arrays.asList(ArrayUtils.toObject(log10Likelihoods)).iterator(); } + @Override public Object[] toArray() { return ArrayUtils.toObject(log10Likelihoods); } + + // none of these are implemented + @Override public boolean contains(final Object o) { notImplemented(); return false; } + @Override public T[] toArray(final T[] ts) { notImplemented(); return null; } + @Override public boolean add(final Double aDouble) { notImplemented(); return false; } + @Override public boolean remove(final Object o) {notImplemented(); return false; } + @Override public boolean containsAll(final Collection objects) { notImplemented(); return false; } + @Override public boolean addAll(final Collection doubles) { notImplemented(); return false; } + @Override public boolean addAll(final int i, final Collection doubles) { notImplemented(); return false; } + @Override public boolean removeAll(final Collection objects) { notImplemented(); return false; } + @Override public boolean retainAll(final Collection objects) { notImplemented(); return false; } + @Override public void clear() { notImplemented(); } + @Override public void add(final int i, final Double aDouble) { notImplemented(); } + @Override public Double remove(final int i) { notImplemented(); return null; } + @Override public int indexOf(final Object o) { notImplemented(); return -1; } + @Override public int lastIndexOf(final Object o) { notImplemented(); return 0; } + @Override public ListIterator listIterator() { notImplemented(); return null; } + @Override public ListIterator listIterator(final int i) { notImplemented(); return null; } + @Override public List subList(final int i, final int i1) { notImplemented(); return null; } // ------------------------------------------------------------------------------------- // @@ -280,7 +316,6 @@ public class GenotypeLikelihoods { * @param ploidy Ploidy, or number of chromosomes in set * @return Number of likelihood elements we need to hold. */ - public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { // fast, closed form solution for diploid samples (most common use case) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 3e6f9dbb0..5604cc620 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -519,13 +519,10 @@ public class VariantContext implements Feature { // to enable tribble integratio return REFERENCE_BASE_FOR_INDEL; } - public String getAlleleWithRefPadding(final Allele allele) { - if ( hasReferenceBaseForIndel() && isIndel() ) { - StringBuilder sb = new StringBuilder(); - sb.append((char)getReferenceBaseForIndel().byteValue()); - sb.append(allele.getDisplayString()); - return sb.toString(); - } else + public String getAlleleStringWithRefPadding(final Allele allele) { + if ( VariantContextUtils.needsPadding(this) ) + return VariantContextUtils.padAllele(this, allele); + else return allele.getDisplayString(); } @@ -1288,6 +1285,7 @@ public class VariantContext implements Feature { // to enable tribble integratio final String field = attr.getKey(); final VCFCompoundHeaderLine format = getMetaDataForField(header, field); final Object decoded = decodeValue(field, attr.getValue(), format); + if ( decoded != null ) newAttributes.put(field, decoded); } @@ -1297,6 +1295,9 @@ public class VariantContext implements Feature { // to enable tribble integratio private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) { if ( value instanceof String ) { + if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) + return value.equals(".") ? null : new GenotypeLikelihoods((String)value); + final String string = (String)value; if ( string.indexOf(",") != -1 ) { final String[] splits = string.split(","); @@ -1307,6 +1308,12 @@ public class VariantContext implements Feature { // to enable tribble integratio } else { return decodeOne(field, string, format); } + } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { + final List asList = (List)value; + final List values = new ArrayList(asList.size()); + for ( final String s : asList ) + values.add(decodeOne(field, s, format)); + return values; } else { return value; } @@ -1321,7 +1328,7 @@ public class VariantContext implements Feature { // to enable tribble integratio case Flag: return Boolean.valueOf(string); case String: return string; case Integer: return Integer.valueOf(string); - case Float: return Float.valueOf(string); + case Float: return Double.valueOf(string); default: throw new ReviewedStingException("Unexpected type for field" + field); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 82a4c055f..b720f8558 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -197,6 +197,16 @@ public class VariantContextUtils { return false; } + public static String padAllele(final VariantContext vc, final Allele allele) { + assert needsPadding(vc); + + StringBuilder sb = new StringBuilder(); + sb.append((char)vc.getReferenceBaseForIndel().byteValue()); + sb.append(allele.getDisplayString()); + return sb.toString(); + } + + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { final boolean padVC = needsPadding(inputVC); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index f41c3243d..7e0a2f524 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -32,10 +32,7 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -177,8 +174,9 @@ class BCF2Writer extends IndexingVariantContextWriter { } private void buildAlleles( VariantContext vc ) throws IOException { + final boolean needsPadding = VariantContextUtils.needsPadding(vc); for ( final Allele allele : vc.getAlleles() ) { - final String s = vc.getAlleleWithRefPadding(allele); + final String s = needsPadding ? VariantContextUtils.padAllele(vc,allele) : allele.getDisplayString(); encoder.encodeTyped(s, BCF2Type.CHAR); } } @@ -211,6 +209,8 @@ class BCF2Writer extends IndexingVariantContextWriter { addGQ(vc); } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { addGenotypeFilters(vc); +// } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { +// addPLs(vc); } else { addGenericGenotypeField(vc, field); } @@ -221,7 +221,7 @@ class BCF2Writer extends IndexingVariantContextWriter { private final int getNGenotypeFieldValues(final String field, final VariantContext vc) { final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field); - int nFields = metaData.getCount(vc.getAlternateAlleles().size()); + int nFields = metaData.getCount(vc.getNAlleles()); if ( nFields == -1 ) { // unbounded, need to look at values return computeMaxSizeOfGenotypeFieldFromValues(field, vc); } else { @@ -264,6 +264,10 @@ class BCF2Writer extends IndexingVariantContextWriter { encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type); } else { final Object val = g.getAttribute(field); + if ( (val instanceof List) && (((List) val).size() != numInFormatField )) + throw new ReviewedStingException("BUG: header for " + field + + " has inconsistent number of values " + numInFormatField + + " compared to values in VariantContext " + ((List) val).size()); final Collection vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val; encoder.encodeRawValues(vals, encoding.BCF2Type); } @@ -297,7 +301,7 @@ class BCF2Writer extends IndexingVariantContextWriter { case Flag: return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1)); case String: - final List s = isList ? (List)value : Collections.singletonList((String)value); + final List s = isList ? (List)value : Collections.singletonList((String) value); return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s); case Integer: // note integer calculation is a bit complex because of the need to determine sizes List l; @@ -346,6 +350,19 @@ class BCF2Writer extends IndexingVariantContextWriter { } } +// private final void addPLs(final VariantContext vc) throws IOException { +// startGenotypeField(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 1, BCF2Type.INT16); +// for ( final Genotype g : vc.getGenotypes() ) { +// if ( g.hasLog10PError() ) { +// final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL)); +// if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc); +// encoder.encodeRawValue(GQ, BCF2Type.INT8); +// } else { +// encoder.encodeRawMissingValues(1, BCF2Type.INT8); +// } +// } +// } + private final void addGenotypes(final VariantContext vc) throws IOException { if ( vc.getNAlleles() > 127 ) throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " + @@ -390,7 +407,9 @@ class BCF2Writer extends IndexingVariantContextWriter { // iterate over strings until we find one that needs 16 bits, and break for ( final String string : strings ) { - final int offset = stringDictionaryMap.get(string); + final Integer got = stringDictionaryMap.get(string); + if ( got == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF"); + final int offset = got; offsets.add(offset); final BCF2Type type1 = encoder.determineIntegerType(offset); switch ( type1 ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 531626540..c7d50b6b1 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import java.util.EnumMap; diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 990dfeec6..a7e8b783f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -27,7 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.testng.Assert; @@ -44,10 +46,17 @@ import java.util.*; * @since Date created */ public class VariantContextTestProvider { - final private static boolean ADVANCED_TESTS = false; - final static VCFHeader header; + final private static boolean ENABLE_VARARRAY_TESTS = false; + final private static boolean ENABLE_PLOIDY_TESTS = false; + final private static boolean ENABLE_PL_TESTS = true; + private static VCFHeader syntheticHeader; final static List TEST_DATAs = new ArrayList(); - final static VariantContext ROOT; + private static VariantContext ROOT; + + private final static List testSourceVCFs = Arrays.asList( + new File(BaseTest.testDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"), + new File(BaseTest.testDir + "dbsnp_135.b37.1000.vcf") + ); public abstract static class VariantContextIOTest { public String toString() { @@ -67,17 +76,19 @@ public class VariantContextTestProvider { } public static class VariantContextTestData { + public final VCFHeader header; public List vcs; - public VariantContextTestData(final VariantContextBuilder builder) { - this(Collections.singletonList(builder.make())); + public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) { + this(header, Collections.singletonList(builder.make())); } - public VariantContextTestData(final VariantContext vc) { - this(Collections.singletonList(vc)); - } - - public VariantContextTestData(final List vcs) { + public VariantContextTestData(final VCFHeader header, final List vcs) { + final Set samples = new HashSet(); + for ( final VariantContext vc : vcs ) + if ( vc.hasGenotypes() ) + samples.addAll(vc.getSampleNames()); + this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaData(), samples); this.vcs = vcs; } @@ -88,9 +99,11 @@ public class VariantContextTestProvider { public String toString() { StringBuilder b = new StringBuilder(); b.append("VariantContextTestData: ["); - for ( VariantContext vc : vcs ) { - b.append(vc.toString()).append(" ----- "); - } + final VariantContext vc = vcs.get(0); + final VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.noGenotypes(); + b.append(builder.make().toString()).append(" nGenotypes = ").append(vc.getNSamples()); + if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records"); b.append("]"); return b.toString(); } @@ -101,11 +114,55 @@ public class VariantContextTestProvider { } private final static void add(VariantContextBuilder builder) { - TEST_DATAs.add(new VariantContextTestData(builder)); + TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder)); } - static { + public static void initializeTests() throws IOException { + createSyntheticHeader(); + makeSyntheticTests(); + makeEmpiricalTests(); + } + + private static void makeEmpiricalTests() throws IOException { + for ( final File file : testSourceVCFs ) { + VCFCodec codec = new VCFCodec(); + Pair> x = readAllVCs( file, codec ); + List fullyDecoded = new ArrayList(x.getSecond().size()); + for ( final VariantContext raw : x.getSecond() ) + fullyDecoded.add(raw.fullyDecode(x.getFirst())); + TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond())); + } + } + + private static void createSyntheticHeader() { Set metaData = new TreeSet(); + + metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x")); + metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); + metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x")); + + metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); + metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); + metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + // prep the header + metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); + + metaData.add(new VCFFilterHeaderLine("FILTER1")); + metaData.add(new VCFFilterHeaderLine("FILTER2")); + + metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x")); + metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x")); + metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x")); + metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x")); + metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x")); + metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x")); + + syntheticHeader = new VCFHeader(metaData); + } + + + private static void makeSyntheticTests() { VariantContextBuilder rootBuilder = new VariantContextBuilder(); rootBuilder.source("test"); rootBuilder.loc("1", 10, 10); @@ -126,8 +183,6 @@ public class VariantContextTestProvider { add(builder().passFilters()); add(builder().filters("FILTER1")); add(builder().filters("FILTER1", "FILTER2")); - metaData.add(new VCFFilterHeaderLine("FILTER1")); - metaData.add(new VCFFilterHeaderLine("FILTER2")); add(builder().log10PError(VariantContext.NO_LOG10_PERROR)); add(builder().log10PError(-1)); @@ -147,12 +202,6 @@ public class VariantContextTestProvider { add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); add(builder().attribute("INT3", null)); add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20))); - metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x")); - metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x")); - - - metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); add(builder().attribute("FLOAT1", 1.0)); add(builder().attribute("FLOAT1", 100.0)); @@ -163,32 +212,17 @@ public class VariantContextTestProvider { add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0))); add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0))); add(builder().attribute("FLOAT3", null)); - metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x")); - metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x")); add(builder().attribute("FLAG", true)); add(builder().attribute("FLAG", false)); - metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x")); add(builder().attribute("STRING1", "s1")); add(builder().attribute("STRING1", null)); add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); add(builder().attribute("STRING3", null)); add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); - metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x")); - metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x")); - - metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype")); - metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality")); - metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); addGenotypesToTestData(); - - // prep the header - metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); - - header = new VCFHeader(metaData); } private static void addGenotypesToTestData() { @@ -250,7 +284,7 @@ public class VariantContextTestProvider { addGenotypeTests(site, homRef, het, homVar); // ploidy - if ( ADVANCED_TESTS ) { + if ( ENABLE_PLOIDY_TESTS ) { addGenotypeTests(site, new Genotype("dip", Arrays.asList(ref, alt1)), new Genotype("hap", Arrays.asList(ref))); @@ -261,7 +295,7 @@ public class VariantContextTestProvider { } } - if ( ADVANCED_TESTS ) { + if ( ENABLE_PL_TESTS ) { // testing PLs addGenotypeTests(site, new Genotype("g1", Arrays.asList(ref, ref), -1, new double[]{0, -1, -2}), @@ -294,7 +328,7 @@ public class VariantContextTestProvider { attr("g1", ref, "INT3", 1, 2, 3), attr("g2", ref, "INT3")); - if ( ADVANCED_TESTS ) { + if (ENABLE_VARARRAY_TESTS) { addGenotypeTests(site, attr("g1", ref, "INT.VAR", 1, 2, 3), attr("g2", ref, "INT.VAR", 4, 5), @@ -331,13 +365,6 @@ public class VariantContextTestProvider { } } - private static VCFHeader getHeader(final List vcs) { - final Set samples = new HashSet(); - for ( final VariantContext vc : vcs ) - samples.addAll(vc.getSampleNames()); - return new VCFHeader(header.getMetaData(), samples); - } - public static List generateSiteTests() { return TEST_DATAs; } @@ -351,31 +378,37 @@ public class VariantContextTestProvider { // write final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = tester.makeWriter(tmpFile, options); - writer.writeHeader(VariantContextTestProvider.getHeader(data.vcs)); + writer.writeHeader(data.header); final List expected = data.vcs; for ( VariantContext vc : expected ) writer.add(vc); writer.close(); - // read in the features - FeatureCodec codec = tester.makeCodec(); - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(tmpFile)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); - // TODO -- test header quality - - pbs = new PositionalBufferedStream(new FileInputStream(tmpFile)); - pbs.skip(header.getHeaderEnd()); - - final List actual = new ArrayList(expected.size()); - while ( ! pbs.isDone() ) { actual.add(codec.decode(pbs)); }; - + final List actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond(); Assert.assertEquals(actual.size(), expected.size()); for ( int i = 0; i < expected.size(); i++ ) VariantContextTestProvider.assertEquals(actual.get(i), expected.get(i)); } + private final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + // read in the features + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final List read = new ArrayList(); + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) read.add(vc); + }; + + return new Pair>((VCFHeader)header.getHeaderValue(), read); + } + public static void assertEquals( final VariantContext actual, final VariantContext expected ) { Assert.assertNotNull(actual); Assert.assertEquals(actual.getChr(), expected.getChr()); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 13ec9aee2..b5beb885d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -57,6 +57,7 @@ public class VariantContextWritersUnitTest extends BaseTest { public void before() throws IOException { IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); dictionary = seq.getSequenceDictionary(); + VariantContextTestProvider.initializeTests(); } @DataProvider(name = "VariantContextTest_SingleContexts")