Near final BCF2 implementation
-- Trivial import changes in some walkers -- SelectVariants has a new hidden mode to fully decode a VCF file -- DepthPerAlleleBySample (AD) changed to have not UNBOUNDED by A type, which is actually the right type -- GenotypeLikelihoods now implements List<Double> for convenience. The PL duality here is going to be removed in a subsequent commit -- BugFixes in BCF2Writer. Proper handling of padding. Bugfix for nFields for a field -- padAllele function in VariantContextUtils -- Much better tests for VariantContextTestProvider, including loading parts of dbSNP 135 and the Phase II 1000G call set with genotypes to test encoding / decoding of fields.
This commit is contained in:
parent
dfee17a672
commit
aaf11f00e3
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -15,10 +16,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -79,9 +77,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
return toADAnnotation(counts);
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
|
||||
|
|
@ -132,11 +128,11 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) );
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), counts);
|
||||
return toADAnnotation(counts);
|
||||
}
|
||||
|
||||
//map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
private final Map<String, Object> toADAnnotation(final Integer[] counts) {
|
||||
return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts));
|
||||
}
|
||||
|
||||
private String getAlleleRepresentation(Allele allele) {
|
||||
|
|
@ -151,5 +147,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
// public String getIndelBases()
|
||||
public List<String> getKeyNames() { return Arrays.asList("AD"); }
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); }
|
||||
public List<VCFFormatHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(
|
||||
new VCFFormatHeaderLine(
|
||||
getKeyNames().get(0),
|
||||
VCFHeaderLineCount.A,
|
||||
VCFHeaderLineType.Integer,
|
||||
"Allelic depths for the ref and alt alleles in the order listed"));
|
||||
}
|
||||
}
|
||||
|
|
@ -310,6 +310,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false)
|
||||
private String outMVFile = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
|
||||
private boolean fullyDecode = false;
|
||||
|
||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
||||
private class RandomVariantStructure {
|
||||
private VariantContext vc;
|
||||
|
|
@ -357,6 +361,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private Random randomGenotypes = new Random();
|
||||
|
||||
private Set<String> IDsToKeep = null;
|
||||
private Map<String, VCFHeader> vcfRods;
|
||||
|
||||
/**
|
||||
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
|
||||
|
|
@ -365,7 +370,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
// Get list of samples to include in the output
|
||||
List<String> rodNames = Arrays.asList(variantCollection.variants.getName());
|
||||
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
|
||||
vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
|
||||
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||
|
||||
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
|
||||
|
|
@ -486,6 +491,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
}
|
||||
|
||||
for (VariantContext vc : vcs) {
|
||||
if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
|
||||
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
|
||||
continue;
|
||||
|
||||
|
|
@ -724,6 +730,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
}
|
||||
|
||||
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) {
|
||||
if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data
|
||||
|
||||
if (KEEP_ORIGINAL_CHR_COUNTS) {
|
||||
if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
|
||||
builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
|
||||
|
|
|
|||
|
|
@ -323,7 +323,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
StringBuilder x = new StringBuilder();
|
||||
x.append(vc.getAlleleWithRefPadding(vc.getReference()));
|
||||
x.append(vc.getAlleleStringWithRefPadding(vc.getReference()));
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
|
|
@ -335,7 +335,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
x.append(vc.getAlleleWithRefPadding(vc.getAlternateAllele(i)));
|
||||
x.append(vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(i)));
|
||||
}
|
||||
return x.toString();
|
||||
}
|
||||
|
|
@ -377,11 +377,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
private static Object splitAltAlleles(VariantContext vc) {
|
||||
final int numAltAlleles = vc.getAlternateAlleles().size();
|
||||
if ( numAltAlleles == 1 )
|
||||
return vc.getAlleleWithRefPadding(vc.getAlternateAllele(0));
|
||||
return vc.getAlleleStringWithRefPadding(vc.getAlternateAllele(0));
|
||||
|
||||
final List<String> alleles = new ArrayList<String>(numAltAlleles);
|
||||
for ( Allele allele : vc.getAlternateAlleles() )
|
||||
alleles.add(vc.getAlleleWithRefPadding(allele));
|
||||
alleles.add(vc.getAlleleStringWithRefPadding(allele));
|
||||
return alleles;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -233,7 +233,7 @@ public class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
}
|
||||
|
||||
public static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
|
||||
if ( AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
|
||||
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
|
||||
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
|
||||
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
|
||||
return clippedAlleles;
|
||||
|
|
|
|||
|
|
@ -24,15 +24,16 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.variantcontext;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypeLikelihoods {
|
||||
public class GenotypeLikelihoods implements List<Double> {
|
||||
public static final boolean CAP_PLS = false;
|
||||
public static final int PL_CAP = 255;
|
||||
|
||||
|
|
@ -205,6 +206,41 @@ public class GenotypeLikelihoods {
|
|||
return s.toString();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// List interface functions
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
private final void notImplemented() {
|
||||
throw new ReviewedStingException("BUG: code not implemented");
|
||||
}
|
||||
|
||||
@Override public int size() { return this.log10Likelihoods.length; }
|
||||
@Override public Double get(final int i) { return log10Likelihoods[i];}
|
||||
@Override public Double set(final int i, final Double aDouble) { return log10Likelihoods[i] = aDouble; }
|
||||
@Override public boolean isEmpty() { return false; }
|
||||
@Override public Iterator<Double> iterator() { return Arrays.asList(ArrayUtils.toObject(log10Likelihoods)).iterator(); }
|
||||
@Override public Object[] toArray() { return ArrayUtils.toObject(log10Likelihoods); }
|
||||
|
||||
// none of these are implemented
|
||||
@Override public boolean contains(final Object o) { notImplemented(); return false; }
|
||||
@Override public <T> T[] toArray(final T[] ts) { notImplemented(); return null; }
|
||||
@Override public boolean add(final Double aDouble) { notImplemented(); return false; }
|
||||
@Override public boolean remove(final Object o) {notImplemented(); return false; }
|
||||
@Override public boolean containsAll(final Collection<?> objects) { notImplemented(); return false; }
|
||||
@Override public boolean addAll(final Collection<? extends Double> doubles) { notImplemented(); return false; }
|
||||
@Override public boolean addAll(final int i, final Collection<? extends Double> doubles) { notImplemented(); return false; }
|
||||
@Override public boolean removeAll(final Collection<?> objects) { notImplemented(); return false; }
|
||||
@Override public boolean retainAll(final Collection<?> objects) { notImplemented(); return false; }
|
||||
@Override public void clear() { notImplemented(); }
|
||||
@Override public void add(final int i, final Double aDouble) { notImplemented(); }
|
||||
@Override public Double remove(final int i) { notImplemented(); return null; }
|
||||
@Override public int indexOf(final Object o) { notImplemented(); return -1; }
|
||||
@Override public int lastIndexOf(final Object o) { notImplemented(); return 0; }
|
||||
@Override public ListIterator<Double> listIterator() { notImplemented(); return null; }
|
||||
@Override public ListIterator<Double> listIterator(final int i) { notImplemented(); return null; }
|
||||
@Override public List<Double> subList(final int i, final int i1) { notImplemented(); return null; }
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -280,7 +316,6 @@ public class GenotypeLikelihoods {
|
|||
* @param ploidy Ploidy, or number of chromosomes in set
|
||||
* @return Number of likelihood elements we need to hold.
|
||||
*/
|
||||
|
||||
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
|
||||
|
||||
// fast, closed form solution for diploid samples (most common use case)
|
||||
|
|
|
|||
|
|
@ -519,13 +519,10 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
return REFERENCE_BASE_FOR_INDEL;
|
||||
}
|
||||
|
||||
public String getAlleleWithRefPadding(final Allele allele) {
|
||||
if ( hasReferenceBaseForIndel() && isIndel() ) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append((char)getReferenceBaseForIndel().byteValue());
|
||||
sb.append(allele.getDisplayString());
|
||||
return sb.toString();
|
||||
} else
|
||||
public String getAlleleStringWithRefPadding(final Allele allele) {
|
||||
if ( VariantContextUtils.needsPadding(this) )
|
||||
return VariantContextUtils.padAllele(this, allele);
|
||||
else
|
||||
return allele.getDisplayString();
|
||||
}
|
||||
|
||||
|
|
@ -1288,6 +1285,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
final String field = attr.getKey();
|
||||
final VCFCompoundHeaderLine format = getMetaDataForField(header, field);
|
||||
final Object decoded = decodeValue(field, attr.getValue(), format);
|
||||
|
||||
if ( decoded != null )
|
||||
newAttributes.put(field, decoded);
|
||||
}
|
||||
|
|
@ -1297,6 +1295,9 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
|
||||
private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) {
|
||||
if ( value instanceof String ) {
|
||||
if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) )
|
||||
return value.equals(".") ? null : new GenotypeLikelihoods((String)value);
|
||||
|
||||
final String string = (String)value;
|
||||
if ( string.indexOf(",") != -1 ) {
|
||||
final String[] splits = string.split(",");
|
||||
|
|
@ -1307,6 +1308,12 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
} else {
|
||||
return decodeOne(field, string, format);
|
||||
}
|
||||
} else if ( value instanceof List && (((List) value).get(0)) instanceof String ) {
|
||||
final List<String> asList = (List<String>)value;
|
||||
final List<Object> values = new ArrayList<Object>(asList.size());
|
||||
for ( final String s : asList )
|
||||
values.add(decodeOne(field, s, format));
|
||||
return values;
|
||||
} else {
|
||||
return value;
|
||||
}
|
||||
|
|
@ -1321,7 +1328,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
case Flag: return Boolean.valueOf(string);
|
||||
case String: return string;
|
||||
case Integer: return Integer.valueOf(string);
|
||||
case Float: return Float.valueOf(string);
|
||||
case Float: return Double.valueOf(string);
|
||||
default: throw new ReviewedStingException("Unexpected type for field" + field);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -197,6 +197,16 @@ public class VariantContextUtils {
|
|||
return false;
|
||||
}
|
||||
|
||||
public static String padAllele(final VariantContext vc, final Allele allele) {
|
||||
assert needsPadding(vc);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append((char)vc.getReferenceBaseForIndel().byteValue());
|
||||
sb.append(allele.getDisplayString());
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) {
|
||||
final boolean padVC = needsPadding(inputVC);
|
||||
|
||||
|
|
|
|||
|
|
@ -32,10 +32,7 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -177,8 +174,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
|
||||
private void buildAlleles( VariantContext vc ) throws IOException {
|
||||
final boolean needsPadding = VariantContextUtils.needsPadding(vc);
|
||||
for ( final Allele allele : vc.getAlleles() ) {
|
||||
final String s = vc.getAlleleWithRefPadding(allele);
|
||||
final String s = needsPadding ? VariantContextUtils.padAllele(vc,allele) : allele.getDisplayString();
|
||||
encoder.encodeTyped(s, BCF2Type.CHAR);
|
||||
}
|
||||
}
|
||||
|
|
@ -211,6 +209,8 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
addGQ(vc);
|
||||
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
addGenotypeFilters(vc);
|
||||
// } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
|
||||
// addPLs(vc);
|
||||
} else {
|
||||
addGenericGenotypeField(vc, field);
|
||||
}
|
||||
|
|
@ -221,7 +221,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
|
||||
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
|
||||
final VCFCompoundHeaderLine metaData = VariantContext.getMetaDataForField(header, field);
|
||||
int nFields = metaData.getCount(vc.getAlternateAlleles().size());
|
||||
int nFields = metaData.getCount(vc.getNAlleles());
|
||||
if ( nFields == -1 ) { // unbounded, need to look at values
|
||||
return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
|
||||
} else {
|
||||
|
|
@ -264,6 +264,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
encoder.encodeRawMissingValues(numInFormatField, encoding.BCF2Type);
|
||||
} else {
|
||||
final Object val = g.getAttribute(field);
|
||||
if ( (val instanceof List) && (((List) val).size() != numInFormatField ))
|
||||
throw new ReviewedStingException("BUG: header for " + field +
|
||||
" has inconsistent number of values " + numInFormatField +
|
||||
" compared to values in VariantContext " + ((List) val).size());
|
||||
final Collection<Object> vals = numInFormatField == 1 ? Collections.singleton(val) : (Collection)val;
|
||||
encoder.encodeRawValues(vals, encoding.BCF2Type);
|
||||
}
|
||||
|
|
@ -297,7 +301,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
case Flag:
|
||||
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
|
||||
case String:
|
||||
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String)value);
|
||||
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
|
||||
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
|
||||
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
|
||||
List<Integer> l;
|
||||
|
|
@ -346,6 +350,19 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
}
|
||||
|
||||
// private final void addPLs(final VariantContext vc) throws IOException {
|
||||
// startGenotypeField(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 1, BCF2Type.INT16);
|
||||
// for ( final Genotype g : vc.getGenotypes() ) {
|
||||
// if ( g.hasLog10PError() ) {
|
||||
// final int GQ = (int)Math.round(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
|
||||
// if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc);
|
||||
// encoder.encodeRawValue(GQ, BCF2Type.INT8);
|
||||
// } else {
|
||||
// encoder.encodeRawMissingValues(1, BCF2Type.INT8);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
private final void addGenotypes(final VariantContext vc) throws IOException {
|
||||
if ( vc.getNAlleles() > 127 )
|
||||
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
|
||||
|
|
@ -390,7 +407,9 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
|
||||
// iterate over strings until we find one that needs 16 bits, and break
|
||||
for ( final String string : strings ) {
|
||||
final int offset = stringDictionaryMap.get(string);
|
||||
final Integer got = stringDictionaryMap.get(string);
|
||||
if ( got == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
|
||||
final int offset = got;
|
||||
offsets.add(offset);
|
||||
final BCF2Type type1 = encoder.determineIntegerType(offset);
|
||||
switch ( type1 ) {
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
|
||||
import java.util.EnumMap;
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext;
|
|||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -44,10 +46,17 @@ import java.util.*;
|
|||
* @since Date created
|
||||
*/
|
||||
public class VariantContextTestProvider {
|
||||
final private static boolean ADVANCED_TESTS = false;
|
||||
final static VCFHeader header;
|
||||
final private static boolean ENABLE_VARARRAY_TESTS = false;
|
||||
final private static boolean ENABLE_PLOIDY_TESTS = false;
|
||||
final private static boolean ENABLE_PL_TESTS = true;
|
||||
private static VCFHeader syntheticHeader;
|
||||
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
|
||||
final static VariantContext ROOT;
|
||||
private static VariantContext ROOT;
|
||||
|
||||
private final static List<File> testSourceVCFs = Arrays.asList(
|
||||
new File(BaseTest.testDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"),
|
||||
new File(BaseTest.testDir + "dbsnp_135.b37.1000.vcf")
|
||||
);
|
||||
|
||||
public abstract static class VariantContextIOTest {
|
||||
public String toString() {
|
||||
|
|
@ -67,17 +76,19 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
|
||||
public static class VariantContextTestData {
|
||||
public final VCFHeader header;
|
||||
public List<VariantContext> vcs;
|
||||
|
||||
public VariantContextTestData(final VariantContextBuilder builder) {
|
||||
this(Collections.singletonList(builder.make()));
|
||||
public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) {
|
||||
this(header, Collections.singletonList(builder.make()));
|
||||
}
|
||||
|
||||
public VariantContextTestData(final VariantContext vc) {
|
||||
this(Collections.singletonList(vc));
|
||||
}
|
||||
|
||||
public VariantContextTestData(final List<VariantContext> vcs) {
|
||||
public VariantContextTestData(final VCFHeader header, final List<VariantContext> vcs) {
|
||||
final Set<String> samples = new HashSet<String>();
|
||||
for ( final VariantContext vc : vcs )
|
||||
if ( vc.hasGenotypes() )
|
||||
samples.addAll(vc.getSampleNames());
|
||||
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaData(), samples);
|
||||
this.vcs = vcs;
|
||||
}
|
||||
|
||||
|
|
@ -88,9 +99,11 @@ public class VariantContextTestProvider {
|
|||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append("VariantContextTestData: [");
|
||||
for ( VariantContext vc : vcs ) {
|
||||
b.append(vc.toString()).append(" ----- ");
|
||||
}
|
||||
final VariantContext vc = vcs.get(0);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.noGenotypes();
|
||||
b.append(builder.make().toString()).append(" nGenotypes = ").append(vc.getNSamples());
|
||||
if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records");
|
||||
b.append("]");
|
||||
return b.toString();
|
||||
}
|
||||
|
|
@ -101,11 +114,55 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
|
||||
private final static void add(VariantContextBuilder builder) {
|
||||
TEST_DATAs.add(new VariantContextTestData(builder));
|
||||
TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder));
|
||||
}
|
||||
|
||||
static {
|
||||
public static void initializeTests() throws IOException {
|
||||
createSyntheticHeader();
|
||||
makeSyntheticTests();
|
||||
makeEmpiricalTests();
|
||||
}
|
||||
|
||||
private static void makeEmpiricalTests() throws IOException {
|
||||
for ( final File file : testSourceVCFs ) {
|
||||
VCFCodec codec = new VCFCodec();
|
||||
Pair<VCFHeader, List<VariantContext>> x = readAllVCs( file, codec );
|
||||
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(x.getSecond().size());
|
||||
for ( final VariantContext raw : x.getSecond() )
|
||||
fullyDecoded.add(raw.fullyDecode(x.getFirst()));
|
||||
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond()));
|
||||
}
|
||||
}
|
||||
|
||||
private static void createSyntheticHeader() {
|
||||
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
|
||||
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
|
||||
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
|
||||
// prep the header
|
||||
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
|
||||
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER1"));
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER2"));
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x"));
|
||||
|
||||
syntheticHeader = new VCFHeader(metaData);
|
||||
}
|
||||
|
||||
|
||||
private static void makeSyntheticTests() {
|
||||
VariantContextBuilder rootBuilder = new VariantContextBuilder();
|
||||
rootBuilder.source("test");
|
||||
rootBuilder.loc("1", 10, 10);
|
||||
|
|
@ -126,8 +183,6 @@ public class VariantContextTestProvider {
|
|||
add(builder().passFilters());
|
||||
add(builder().filters("FILTER1"));
|
||||
add(builder().filters("FILTER1", "FILTER2"));
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER1"));
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER2"));
|
||||
|
||||
add(builder().log10PError(VariantContext.NO_LOG10_PERROR));
|
||||
add(builder().log10PError(-1));
|
||||
|
|
@ -147,12 +202,6 @@ public class VariantContextTestProvider {
|
|||
add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000)));
|
||||
add(builder().attribute("INT3", null));
|
||||
add(builder().attribute("INT20", Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)));
|
||||
metaData.add(new VCFInfoHeaderLine("INT1", 1, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("INT3", 3, VCFHeaderLineType.Integer, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("INT20", 20, VCFHeaderLineType.Integer, "x"));
|
||||
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
|
||||
add(builder().attribute("FLOAT1", 1.0));
|
||||
add(builder().attribute("FLOAT1", 100.0));
|
||||
|
|
@ -163,32 +212,17 @@ public class VariantContextTestProvider {
|
|||
add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0)));
|
||||
add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0)));
|
||||
add(builder().attribute("FLOAT3", null));
|
||||
metaData.add(new VCFInfoHeaderLine("FLOAT1", 1, VCFHeaderLineType.Float, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("FLOAT3", 3, VCFHeaderLineType.Float, "x"));
|
||||
|
||||
add(builder().attribute("FLAG", true));
|
||||
add(builder().attribute("FLAG", false));
|
||||
metaData.add(new VCFInfoHeaderLine("FLAG", 1, VCFHeaderLineType.Flag, "x"));
|
||||
|
||||
add(builder().attribute("STRING1", "s1"));
|
||||
add(builder().attribute("STRING1", null));
|
||||
add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
|
||||
add(builder().attribute("STRING3", null));
|
||||
add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING1", 1, VCFHeaderLineType.String, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING3", 3, VCFHeaderLineType.String, "x"));
|
||||
metaData.add(new VCFInfoHeaderLine("STRING20", 20, VCFHeaderLineType.String, "x"));
|
||||
|
||||
metaData.add(new VCFInfoHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
|
||||
metaData.add(new VCFInfoHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
|
||||
metaData.add(new VCFInfoHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
|
||||
|
||||
addGenotypesToTestData();
|
||||
|
||||
// prep the header
|
||||
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
|
||||
|
||||
header = new VCFHeader(metaData);
|
||||
}
|
||||
|
||||
private static void addGenotypesToTestData() {
|
||||
|
|
@ -250,7 +284,7 @@ public class VariantContextTestProvider {
|
|||
addGenotypeTests(site, homRef, het, homVar);
|
||||
|
||||
// ploidy
|
||||
if ( ADVANCED_TESTS ) {
|
||||
if ( ENABLE_PLOIDY_TESTS ) {
|
||||
addGenotypeTests(site,
|
||||
new Genotype("dip", Arrays.asList(ref, alt1)),
|
||||
new Genotype("hap", Arrays.asList(ref)));
|
||||
|
|
@ -261,7 +295,7 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
}
|
||||
|
||||
if ( ADVANCED_TESTS ) {
|
||||
if ( ENABLE_PL_TESTS ) {
|
||||
// testing PLs
|
||||
addGenotypeTests(site,
|
||||
new Genotype("g1", Arrays.asList(ref, ref), -1, new double[]{0, -1, -2}),
|
||||
|
|
@ -294,7 +328,7 @@ public class VariantContextTestProvider {
|
|||
attr("g1", ref, "INT3", 1, 2, 3),
|
||||
attr("g2", ref, "INT3"));
|
||||
|
||||
if ( ADVANCED_TESTS ) {
|
||||
if (ENABLE_VARARRAY_TESTS) {
|
||||
addGenotypeTests(site,
|
||||
attr("g1", ref, "INT.VAR", 1, 2, 3),
|
||||
attr("g2", ref, "INT.VAR", 4, 5),
|
||||
|
|
@ -331,13 +365,6 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
}
|
||||
|
||||
private static VCFHeader getHeader(final List<VariantContext> vcs) {
|
||||
final Set<String> samples = new HashSet<String>();
|
||||
for ( final VariantContext vc : vcs )
|
||||
samples.addAll(vc.getSampleNames());
|
||||
return new VCFHeader(header.getMetaData(), samples);
|
||||
}
|
||||
|
||||
public static List<VariantContextTestData> generateSiteTests() {
|
||||
return TEST_DATAs;
|
||||
}
|
||||
|
|
@ -351,31 +378,37 @@ public class VariantContextTestProvider {
|
|||
// write
|
||||
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
|
||||
final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
|
||||
writer.writeHeader(VariantContextTestProvider.getHeader(data.vcs));
|
||||
writer.writeHeader(data.header);
|
||||
final List<VariantContext> expected = data.vcs;
|
||||
for ( VariantContext vc : expected )
|
||||
writer.add(vc);
|
||||
writer.close();
|
||||
|
||||
// read in the features
|
||||
FeatureCodec<VariantContext> codec = tester.makeCodec();
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(tmpFile));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
// TODO -- test header quality
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(tmpFile));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final List<VariantContext> actual = new ArrayList<VariantContext>(expected.size());
|
||||
while ( ! pbs.isDone() ) { actual.add(codec.decode(pbs)); };
|
||||
|
||||
final List<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
|
||||
Assert.assertEquals(actual.size(), expected.size());
|
||||
|
||||
for ( int i = 0; i < expected.size(); i++ )
|
||||
VariantContextTestProvider.assertEquals(actual.get(i), expected.get(i));
|
||||
}
|
||||
|
||||
private final static Pair<VCFHeader, List<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
|
||||
// read in the features
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final List<VariantContext> read = new ArrayList<VariantContext>();
|
||||
while ( ! pbs.isDone() ) {
|
||||
final VariantContext vc = codec.decode(pbs);
|
||||
if ( vc != null ) read.add(vc);
|
||||
};
|
||||
|
||||
return new Pair<VCFHeader, List<VariantContext>>((VCFHeader)header.getHeaderValue(), read);
|
||||
}
|
||||
|
||||
public static void assertEquals( final VariantContext actual, final VariantContext expected ) {
|
||||
Assert.assertNotNull(actual);
|
||||
Assert.assertEquals(actual.getChr(), expected.getChr());
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ public class VariantContextWritersUnitTest extends BaseTest {
|
|||
public void before() throws IOException {
|
||||
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
dictionary = seq.getSequenceDictionary();
|
||||
VariantContextTestProvider.initializeTests();
|
||||
}
|
||||
|
||||
@DataProvider(name = "VariantContextTest_SingleContexts")
|
||||
|
|
|
|||
Loading…
Reference in New Issue