diff --git a/ivy.xml b/ivy.xml index 1802c1627..13ecfa2d2 100644 --- a/ivy.xml +++ b/ivy.xml @@ -35,6 +35,9 @@ + + + diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 31ed3dcc8..6d38940bc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -62,6 +62,27 @@ import java.util.*; */ public class CombineVariantsUnitTest { + public static int VCF4headerStringCount = 16; + + public static String VCF4headerStrings = + "##fileformat=VCFv4.0\n"+ + "##filedate=2010-06-21\n"+ + "##reference=NCBI36\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##FILTER=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + // this header is a small subset of the header in VCFHeaderUnitTest: VCF4headerStrings public static String VCF4headerStringsSmallSubset = "##fileformat=VCFv4.0\n" + @@ -159,34 +180,34 @@ public class CombineVariantsUnitTest { @Test public void testHeadersWhereOneIsAStrictSubsetOfTheOther() { - VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings); + VCFHeader one = createHeader(VCF4headerStrings); VCFHeader two = createHeader(VCF4headerStringsSmallSubset); ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); Set lines = VCFUtils.smartMergeHeaders(headers, false); - Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount); + Assert.assertEquals(lines.size(), VCF4headerStringCount); } @Test(expectedExceptions=IllegalStateException.class) public void testHeadersInfoDifferentValues() { - VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings); + VCFHeader one = createHeader(VCF4headerStrings); VCFHeader two = createHeader(VCF4headerStringsBrokenInfo); ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); Set lines = VCFUtils.smartMergeHeaders(headers, false); - Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount); + Assert.assertEquals(lines.size(), VCF4headerStringCount); } @Test public void testHeadersFormatDifferentValues() { - VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings); + VCFHeader one = createHeader(VCF4headerStrings); VCFHeader two = createHeader(VCF4headerStringsBrokenFormat); ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); Set lines = VCFUtils.smartMergeHeaders(headers, false); - Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount); + Assert.assertEquals(lines.size(), VCF4headerStringCount); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMapUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 6053a0fde..84bdfd19b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -45,6 +45,7 @@ */ package org.broadinstitute.sting.utils.genotyper; + import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; @@ -79,7 +80,6 @@ import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; import org.broadinstitute.variant.vcf.VCFCodec; import java.io.File; import java.io.FileNotFoundException; diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java index cbc7c01ed..0fba432e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java @@ -26,12 +26,14 @@ package org.broadinstitute.sting.utils.variant; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.variant.bcf2.BCF2Codec; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.vcf.*; @@ -162,6 +164,67 @@ public class GATKVCFUtils { return rsID; } + /** + * Utility class to read all of the VC records from a file + * + * @param source + * @param codec + * @return + * @throws IOException + */ + public final static Pair readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + // read in the features + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + return new Pair(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); + } + + public static class VCIterable implements Iterable, Iterator { + final PositionalBufferedStream pbs; + final FeatureCodec codec; + final VCFHeader header; + + private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec codec, final VCFHeader header) { + this.pbs = pbs; + this.codec = codec; + this.header = header; + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + try { + return ! pbs.isDone(); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + @Override + public VariantContext next() { + try { + final VariantContext vc = codec.decode(pbs); + return vc == null ? null : vc.fullyDecode(header, false); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + } + } + /** * Read all of the VCF records from source into memory, returning the header and the VariantContexts * diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2Codec.java deleted file mode 100644 index 098b2a5b0..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Codec.java +++ /dev/null @@ -1,499 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.FeatureCodecHeader; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.ByteArrayInputStream; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Decode BCF2 files - */ -public final class BCF2Codec implements FeatureCodec { - private final static int ALLOWED_MAJOR_VERSION = 2; - private final static int MIN_MINOR_VERSION = 1; - - private BCFVersion bcfVersion = null; - - private VCFHeader header = null; - - /** - * Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field - */ - private final ArrayList contigNames = new ArrayList(); - - /** - * Maps header string names (encoded in VCF) into strings found in the BCF header - * - * Initialized when processing the header - */ - private ArrayList dictionary; - - /** - * Our decoder that reads low-level objects from the BCF2 records - */ - private final BCF2Decoder decoder = new BCF2Decoder(); - - /** - * Provides some sanity checking on the header - */ - private final static int MAX_HEADER_SIZE = 0x08000000; - - /** - * Genotype field decoders that are initialized when the header is read - */ - private BCF2GenotypeFieldDecoders gtFieldDecoders = null; - - /** - * A cached array of GenotypeBuilders for efficient genotype decoding. - * - * Caching it allows us to avoid recreating this intermediate data - * structure each time we decode genotypes - */ - private GenotypeBuilder[] builders = null; - - // for error handling - private int recordNo = 0; - private int pos = 0; - - - // ---------------------------------------------------------------------- - // - // Feature codec interface functions - // - // ---------------------------------------------------------------------- - - @Override - public Feature decodeLoc( final PositionalBufferedStream inputStream ) { - return decode(inputStream); - } - - @Override - public VariantContext decode( final PositionalBufferedStream inputStream ) { - try { - recordNo++; - final VariantContextBuilder builder = new VariantContextBuilder(); - - final int sitesBlockSize = decoder.readBlockSize(inputStream); - final int genotypeBlockSize = decoder.readBlockSize(inputStream); - - decoder.readNextBlock(sitesBlockSize, inputStream); - decodeSiteLoc(builder); - final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder); - - decoder.readNextBlock(genotypeBlockSize, inputStream); - createLazyGenotypesDecoder(info, builder); - return builder.fullyDecoded(true).make(); - } catch ( IOException e ) { - throw new TribbleException("Failed to read BCF file", e); - } - } - - @Override - public Class getFeatureType() { - return VariantContext.class; - } - - @Override - public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { - try { - // note that this reads the magic as well, and so does double duty - bcfVersion = BCFVersion.readBCFVersion(inputStream); - if ( bcfVersion == null ) - error("Input stream does not contain a BCF encoded file; BCF magic header info not found"); - - if ( bcfVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION ) - error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion()); - if ( bcfVersion.getMinorVersion() < MIN_MINOR_VERSION ) - error("BCF2Codec can only process BCF2 files with minor version >= " + MIN_MINOR_VERSION + " but this file has minor version " + bcfVersion.getMinorVersion()); - - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Parsing data stream with BCF version " + bcfVersion); - } - - final int headerSizeInBytes = BCF2Type.INT32.read(inputStream); - - if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB - error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); - - final byte[] headerBytes = new byte[headerSizeInBytes]; - if ( inputStream.read(headerBytes) != headerSizeInBytes ) - error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); - - final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); - final AsciiLineReader headerReader = new AsciiLineReader(bps); - final VCFCodec headerParser = new VCFCodec(); - this.header = (VCFHeader)headerParser.readHeader(headerReader); - bps.close(); - } catch ( IOException e ) { - throw new TribbleException("I/O error while reading BCF2 header"); - } - - // create the config offsets - if ( ! header.getContigLines().isEmpty() ) { - contigNames.clear(); - for ( final VCFContigHeaderLine contig : header.getContigLines()) { - if ( contig.getID() == null || contig.getID().equals("") ) - error("found a contig with an invalid ID " + contig); - contigNames.add(contig.getID()); - } - } else { - error("Didn't find any contig lines in BCF2 file header"); - } - - // create the string dictionary - dictionary = parseDictionary(header); - - // prepare the genotype field decoders - gtFieldDecoders = new BCF2GenotypeFieldDecoders(header); - - // create and initialize the genotype builder array - final int nSamples = header.getNGenotypeSamples(); - builders = new GenotypeBuilder[nSamples]; - for ( int i = 0; i < nSamples; i++ ) { - builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i)); - } - - // position right before next line (would be right before first real record byte at end of header) - return new FeatureCodecHeader(header, inputStream.getPosition()); - } - - @Override - public boolean canDecode( final String path ) { - FileInputStream fis = null; - try { - fis = new FileInputStream(path); - final BCFVersion version = BCFVersion.readBCFVersion(fis); - return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION; - } catch ( FileNotFoundException e ) { - return false; - } catch ( IOException e ) { - return false; - } finally { - try { - if ( fis != null ) fis.close(); - } catch ( IOException e ) { - // do nothing - } - } - } - - // -------------------------------------------------------------------------------- - // - // implicit block - // - // The first four records of BCF are inline untype encoded data of: - // - // 4 byte integer chrom offset - // 4 byte integer start - // 4 byte integer ref length - // 4 byte float qual - // - // -------------------------------------------------------------------------------- - - /** - * Decode the sites level data from this classes decoder - * - * @param builder - * @return - */ - @Requires({"builder != null"}) - private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOException { - final int contigOffset = decoder.decodeInt(BCF2Type.INT32); - final String contig = lookupContigName(contigOffset); - builder.chr(contig); - - this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based - final int refLength = decoder.decodeInt(BCF2Type.INT32); - builder.start((long)pos); - builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open - } - - /** - * Decode the sites level data from this classes decoder - * - * @param builder - * @return - */ - @Requires({"builder != null", "decoder != null"}) - @Ensures({"result != null", "result.isValid()"}) - private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) throws IOException { - final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT); - if ( qual != null ) { - builder.log10PError(((Double)qual) / -10.0); - } - - final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32); - final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32); - final int nAlleles = nAlleleInfo >> 16; - final int nInfo = nAlleleInfo & 0x0000FFFF; - final int nFormatFields = nFormatSamples >> 24; - final int nSamples = nFormatSamples & 0x00FFFFF; - - if ( header.getNGenotypeSamples() != nSamples ) - error("Reading BCF2 files with different numbers of samples per record " + - "is not currently supported. Saw " + header.getNGenotypeSamples() + - " samples in header but have a record with " + nSamples + " samples"); - - decodeID(builder); - final List alleles = decodeAlleles(builder, pos, nAlleles); - decodeFilter(builder); - decodeInfo(builder, nInfo); - - final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles); - if ( ! info.isValid() ) - error("Sites info is malformed: " + info); - return info; - } - - protected final static class SitesInfoForDecoding { - final int nFormatFields; - final int nSamples; - final List alleles; - - private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List alleles) { - this.nFormatFields = nFormatFields; - this.nSamples = nSamples; - this.alleles = alleles; - } - - public boolean isValid() { - return nFormatFields >= 0 && - nSamples >= 0 && - alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference(); - } - - @Override - public String toString() { - return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles); - } - } - - /** - * Decode the id field in this BCF2 file and store it in the builder - * @param builder - */ - private void decodeID( final VariantContextBuilder builder ) throws IOException { - final String id = (String)decoder.decodeTypedValue(); - - if ( id == null ) - builder.noID(); - else - builder.id(id); - } - - /** - * Decode the alleles from this BCF2 file and put the results in builder - * @param builder - * @param pos - * @param nAlleles - * @return the alleles - */ - @Requires("nAlleles > 0") - private List decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) throws IOException { - // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes - List alleles = new ArrayList(nAlleles); - String ref = null; - - for ( int i = 0; i < nAlleles; i++ ) { - final String alleleBases = (String)decoder.decodeTypedValue(); - - final boolean isRef = i == 0; - final Allele allele = Allele.create(alleleBases, isRef); - if ( isRef ) ref = alleleBases; - - alleles.add(allele); - } - assert ref != null; - - builder.alleles(alleles); - - assert ref.length() > 0; - - return alleles; - } - - /** - * Decode the filter field of this BCF2 file and store the result in the builder - * @param builder - */ - private void decodeFilter( final VariantContextBuilder builder ) throws IOException { - final Object value = decoder.decodeTypedValue(); - - if ( value == null ) - builder.unfiltered(); - else { - if ( value instanceof Integer ) { - // fast path for single integer result - final String filterString = getDictionaryString((Integer)value); - if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString)) - builder.passFilters(); - else - builder.filter(filterString); - } else { - for ( final int offset : (List)value ) - builder.filter(getDictionaryString(offset)); - } - } - } - - /** - * Loop over the info field key / value pairs in this BCF2 file and decode them into the builder - * - * @param builder - * @param numInfoFields - */ - @Requires("numInfoFields >= 0") - private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) throws IOException { - if ( numInfoFields == 0 ) - // fast path, don't bother doing any work if there are no fields - return; - - final Map infoFieldEntries = new HashMap(numInfoFields); - for ( int i = 0; i < numInfoFields; i++ ) { - final String key = getDictionaryString(); - Object value = decoder.decodeTypedValue(); - final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, key); - if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags - infoFieldEntries.put(key, value); - } - - builder.attributes(infoFieldEntries); - } - - // -------------------------------------------------------------------------------- - // - // Decoding Genotypes - // - // -------------------------------------------------------------------------------- - - /** - * Create the lazy loader for the genotypes data, and store it in the builder - * so that the VC will be able to decode on demand the genotypes data - * - * @param siteInfo - * @param builder - */ - private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo, - final VariantContextBuilder builder ) { - if (siteInfo.nSamples > 0) { - final LazyGenotypesContext.LazyParser lazyParser = - new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); - - final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes()); - final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples()); - - // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) - lazy.decode(); - - builder.genotypesNoValidation(lazy); - } - } - - public static class LazyData { - final public VCFHeader header; - final public int nGenotypeFields; - final public byte[] bytes; - - @Requires({"nGenotypeFields > 0", "bytes != null"}) - public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) { - this.header = header; - this.nGenotypeFields = nGenotypeFields; - this.bytes = bytes; - } - } - - @Ensures("result != null") - private final String getDictionaryString() throws IOException { - return getDictionaryString((Integer) decoder.decodeTypedValue()); - } - - @Requires("offset < dictionary.size()") - @Ensures("result != null") - protected final String getDictionaryString(final int offset) { - return dictionary.get(offset); - } - - /** - * Translate the config offset as encoded in the BCF file into the actual string - * name of the contig from the dictionary - * - * @param contigOffset - * @return - */ - @Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"}) - @Ensures("result != null") - private final String lookupContigName( final int contigOffset ) { - return contigNames.get(contigOffset); - } - - @Requires("header != null") - @Ensures({"result != null", "! result.isEmpty()"}) - private final ArrayList parseDictionary(final VCFHeader header) { - final ArrayList dict = BCF2Utils.makeDictionary(header); - - // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dict.isEmpty() ) - error("Dictionary header element was absent or empty"); - - return dict; - } - - /** - * @return the VCFHeader we found in this BCF2 file - */ - protected VCFHeader getHeader() { - return header; - } - - @Requires("field != null") - @Ensures("result != null") - protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) { - return gtFieldDecoders.getDecoder(field); - } - - private void error(final String message) throws RuntimeException { - throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos)); - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2Decoder.java deleted file mode 100644 index b9970706b..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Decoder.java +++ /dev/null @@ -1,375 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.GeneralUtils; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; - -public final class BCF2Decoder { - byte[] recordBytes = null; - ByteArrayInputStream recordStream = null; - - public BCF2Decoder() { - // nothing to do - } - - /** - * Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes - * - * @param recordBytes - */ - protected BCF2Decoder(final byte[] recordBytes) { - setRecordBytes(recordBytes); - } - - // ---------------------------------------------------------------------- - // - // Routines to load, set, skip blocks of underlying data we are decoding - // - // ---------------------------------------------------------------------- - - /** - * Reads the next record from input stream and prepare this decoder to decode values from it - * - * @param stream - * @return - */ - public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { - if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes); - setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); - } - - /** - * Skips the next record from input stream, invalidating current block data - * - * @param stream - * @return - */ - public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { - try { - final int bytesRead = (int)stream.skip(blockSizeInBytes); - validateReadBytes(bytesRead, 1, blockSizeInBytes); - } catch ( IOException e ) { - throw new TribbleException("I/O error while reading BCF2 file", e); - } - this.recordBytes = null; - this.recordStream = null; - } - - /** - * Returns the byte[] for the block of data we are currently decoding - * @return - */ - public byte[] getRecordBytes() { - return recordBytes; - } - - /** - * The size of the current block in bytes - * - * @return - */ - public int getBlockSize() { - return recordBytes.length; - } - - public boolean blockIsFullyDecoded() { - return recordStream.available() == 0; - } - - /** - * Use the recordBytes[] to read BCF2 records from now on - * - * @param recordBytes - */ - @Requires("recordBytes != null") - @Ensures({"this.recordBytes == recordBytes", "recordStream != null"}) - public void setRecordBytes(final byte[] recordBytes) { - this.recordBytes = recordBytes; - this.recordStream = new ByteArrayInputStream(recordBytes); - } - - // ---------------------------------------------------------------------- - // - // High-level decoder - // - // ---------------------------------------------------------------------- - - public final Object decodeTypedValue() throws IOException { - final byte typeDescriptor = readTypeDescriptor(); - return decodeTypedValue(typeDescriptor); - } - - public final Object decodeTypedValue(final byte typeDescriptor) throws IOException { - final int size = decodeNumberOfElements(typeDescriptor); - return decodeTypedValue(typeDescriptor, size); - } - - @Requires("size >= 0") - public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException { - if ( size == 0 ) { - // missing value => null in java - return null; - } else { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency - return decodeLiteralString(size); - } else if ( size == 1 ) { - return decodeSingleValue(type); - } else { - final ArrayList ints = new ArrayList(size); - for ( int i = 0; i < size; i++ ) { - final Object val = decodeSingleValue(type); - if ( val == null ) continue; // auto-pruning. We remove trailing nulls - ints.add(val); - } - return ints.isEmpty() ? null : ints; // return null when all of the values are null - } - } - } - - public final Object decodeSingleValue(final BCF2Type type) throws IOException { - // TODO -- decodeTypedValue should integrate this routine - final int value = decodeInt(type); - - if ( value == type.getMissingBytes() ) - return null; - else { - switch (type) { - case INT8: - case INT16: - case INT32: return value; - case FLOAT: return rawFloatToFloat(value); - case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased - default: throw new TribbleException("BCF2 codec doesn't know how to decode type " + type ); - } - } - } - - // ---------------------------------------------------------------------- - // - // Decode raw primitive data types (ints, floats, and strings) - // - // ---------------------------------------------------------------------- - - private final Object decodeLiteralString(final int size) { - assert size > 0; - - // TODO -- assumes size > 0 - final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array - try { - recordStream.read(bytes); - - int goodLength = 0; - for ( ; goodLength < bytes.length ; goodLength++ ) - if ( bytes[goodLength] == 0 ) break; - - if ( goodLength == 0 ) - return null; - else { - final String s = new String(bytes, 0, goodLength); - return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s; - } - } catch ( IOException e ) { - throw new TribbleException("readByte failure", e); - } - } - - @Ensures("result >= 0") - public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException { - if ( BCF2Utils.sizeIsOverflow(typeDescriptor) ) - // -1 ensures we explode immediately with a bad size if the result is missing - return decodeInt(readTypeDescriptor(), -1); - else - // the size is inline, so just decode it - return BCF2Utils.decodeSize(typeDescriptor); - } - - /** - * Decode an int from the stream. If the value in the stream is missing, - * returns missingValue. Requires the typeDescriptor indicate an inline - * single element event - * - * @param typeDescriptor - * @return - */ - @Requires("BCF2Utils.decodeSize(typeDescriptor) == 1") - public final int decodeInt(final byte typeDescriptor, final int missingValue) throws IOException { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - final int i = decodeInt(type); - return i == type.getMissingBytes() ? missingValue : i; - } - - @Requires("type != null") - public final int decodeInt(final BCF2Type type) throws IOException { - return type.read(recordStream); - } - - /** - * Low-level reader for int[] - * - * Requires a typeDescriptor so the function knows how many elements to read, - * and how they are encoded. - * - * If size == 0 => result is null - * If size > 0 => result depends on the actual values in the stream - * -- If the first element read is MISSING, result is null (all values are missing) - * -- Else result = int[N] where N is the first N non-missing values decoded - * - * @param maybeDest if not null we'll not allocate space for the vector, but instead use - * the externally allocated array of ints to store values. If the - * size of this vector is < the actual size of the elements, we'll be - * forced to use freshly allocated arrays. Also note that padded - * int elements are still forced to do a fresh allocation as well. - * @return see description - */ - @Requires({"type != null", "type.isIntegerType()", "size >= 0"}) - public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { - if ( size == 0 ) { - return null; - } else { - if ( maybeDest != null && maybeDest.length < size ) - maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small - - final int val1 = decodeInt(type); - if ( val1 == type.getMissingBytes() ) { - // fast path for first element being missing - for ( int i = 1; i < size; i++ ) decodeInt(type); - return null; - } else { - // we know we will have at least 1 element, so making the int[] is worth it - final int[] ints = maybeDest == null ? new int[size] : maybeDest; - ints[0] = val1; // we already read the first one - for ( int i = 1; i < size; i++ ) { - ints[i] = decodeInt(type); - if ( ints[i] == type.getMissingBytes() ) { - // read the rest of the missing values, dropping them - for ( int j = i + 1; j < size; j++ ) decodeInt(type); - // deal with auto-pruning by returning an int[] containing - // only the non-MISSING values. We do this by copying the first - // i elements, as i itself is missing - return Arrays.copyOf(ints, i); - } - } - return ints; // all of the elements were non-MISSING - } - } - } - - public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - return decodeIntArray(size, type, null); - } - - private double rawFloatToFloat(final int rawFloat) { - return (double)Float.intBitsToFloat(rawFloat); - } - - // ---------------------------------------------------------------------- - // - // Utility functions - // - // ---------------------------------------------------------------------- - - /** - * Read the size of the next block from inputStream - * - * @param inputStream - * @return - */ - public final int readBlockSize(final InputStream inputStream) throws IOException { - return BCF2Type.INT32.read(inputStream); - } - - /** - * Read all bytes for a BCF record block into a byte[], and return it - * - * Is smart about reading from the stream multiple times to fill the buffer, if necessary - * - * @param blockSizeInBytes number of bytes to read - * @param inputStream the stream to read from - * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream - */ - @Requires({"blockSizeInBytes >= 0", "inputStream != null"}) - @Ensures("result != null") - private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { - assert blockSizeInBytes >= 0; - - final byte[] record = new byte[blockSizeInBytes]; - try { - int bytesRead = 0; - int nReadAttempts = 0; // keep track of how many times we've read - - // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF - while ( bytesRead < blockSizeInBytes ) { - final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); - if ( read1 == -1 ) - validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); - else - bytesRead += read1; - } - - if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me - System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); - } - - validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); - } catch ( IOException e ) { - throw new TribbleException("I/O error while reading BCF2 file", e); - } - - return record; - } - - /** - * Make sure we read the right number of bytes, or throw an error - * - * @param actuallyRead - * @param nReadAttempts - * @param expected - */ - private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { - assert expected >= 0; - - if ( actuallyRead < expected ) { - throw new TribbleException( - String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", - expected, actuallyRead, nReadAttempts)); - } - } - - public final byte readTypeDescriptor() throws IOException { - return BCF2Utils.readByte(recordStream); - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2GenotypeFieldDecoders.java deleted file mode 100644 index 87d676526..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2GenotypeFieldDecoders.java +++ /dev/null @@ -1,284 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; - -import java.io.IOException; -import java.util.*; - -/** - * An efficient scheme for building and obtaining specialized - * genotype field decoders. Used by the BCFCodec to parse - * with little overhead the fields from BCF2 encoded genotype - * records - * - * @author Mark DePristo - * @since 6/12 - */ -public class BCF2GenotypeFieldDecoders { - private final static boolean ENABLE_FASTPATH_GT = true; - private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number - - // initialized once per writer to allow parallel writers to work - private final HashMap genotypeFieldDecoder = new HashMap(); - private final Decoder defaultDecoder = new GenericDecoder(); - - public BCF2GenotypeFieldDecoders(final VCFHeader header) { - // TODO -- fill in appropriate decoders for each FORMAT field in the header - - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder()); - // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder()); - genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder()); - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder()); - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_PL_KEY, new PLDecoder()); - genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder()); - } - - // ----------------------------------------------------------------- - // - // Genotype field decoder - // - // ----------------------------------------------------------------- - - /** - * Return decoder appropriate for field, or the generic decoder if no - * specialized one is bound - * @param field the GT field to decode - * @return a non-null decoder - */ - @Requires("field != null") - @Ensures("result != null") - public Decoder getDecoder(final String field) { - final Decoder d = genotypeFieldDecoder.get(field); - return d == null ? defaultDecoder : d; - } - - /** - * Decoder a field (implicit from creation) encoded as - * typeDescriptor in the decoder object in the GenotypeBuilders - * one for each sample in order. - * - * The way this works is that this decode method - * iterates over the builders, decoding a genotype field - * in BCF2 for each sample from decoder. - * - * This system allows us to easily use specialized - * decoders for specific genotype field values. For example, - * we use a special decoder to directly read the BCF2 data for - * the PL field into a int[] rather than the generic List of Integer - */ - public interface Decoder { - @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()", - "field != null", "decoder != null", "gbs != null", "gbs.length != 0"}) - public void decode(final List siteAlleles, - final String field, - final BCF2Decoder decoder, - final byte typeDescriptor, - final int numElements, - final GenotypeBuilder[] gbs) throws IOException; - } - - private class GTDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES ) - fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs); - else { - generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs); - } - } - - /** - * fast path for many samples with diploid genotypes - * - * The way this would work is simple. Create a List diploidGenotypes[] object - * After decoding the offset, if that sample is diploid compute the - * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1 - * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype - * cache it and use that - * - * Some notes. If there are nAlleles at the site, there are implicitly actually - * n + 1 options including - */ - @Requires("siteAlleles.size() == 2") - @SuppressWarnings({"unchecked"}) - private final void fastBiallelicDiploidDecode(final List siteAlleles, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - - final int nPossibleGenotypes = 3 * 3; - final Object allGenotypes[] = new Object[nPossibleGenotypes]; - - for ( final GenotypeBuilder gb : gbs ) { - final int a1 = decoder.decodeInt(type); - final int a2 = decoder.decodeInt(type); - - if ( a1 == type.getMissingBytes() ) { - assert a2 == type.getMissingBytes(); - // no called sample GT = . - gb.alleles(null); - } else if ( a2 == type.getMissingBytes() ) { - gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1))); - } else { - // downshift to remove phase - final int offset = (a1 >> 1) * 3 + (a2 >> 1); - assert offset < allGenotypes.length; - - // TODO -- how can I get rid of this cast? - List gt = (List)allGenotypes[offset]; - if ( gt == null ) { - final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1); - final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2); - gt = Arrays.asList(allele1, allele2); - allGenotypes[offset] = gt; - } - - gb.alleles(gt); - } - - final boolean phased = (a1 & 0x01) == 1; - gb.phased(phased); - } - } - - private final void generalDecode(final List siteAlleles, - final int ploidy, - final BCF2Decoder decoder, - final byte typeDescriptor, - final GenotypeBuilder[] gbs) throws IOException { - final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); - - // a single cache for the encoded genotypes, since we don't actually need this vector - final int[] tmp = new int[ploidy]; - - for ( final GenotypeBuilder gb : gbs ) { - final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp); - if ( encoded == null ) - // no called sample GT = . - gb.alleles(null); - else { - assert encoded.length > 0; - - // we have at least some alleles to decode - final List gt = new ArrayList(encoded.length); - - // note that the auto-pruning of fields magically handles different - // ploidy per sample at a site - for ( final int encode : encoded ) - gt.add(getAlleleFromEncoded(siteAlleles, encode)); - - gb.alleles(gt); - final boolean phased = (encoded[0] & 0x01) == 1; - gb.phased(phased); - } - } - } - - @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"}) - @Ensures("result != null") - private final Allele getAlleleFromEncoded(final List siteAlleles, final int encode) { - final int offset = encode >> 1; - return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1); - } - } - - private class DPDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - // the -1 is for missing - gb.DP(decoder.decodeInt(typeDescriptor, -1)); - } - } - } - - private class GQDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - // the -1 is for missing - gb.GQ(decoder.decodeInt(typeDescriptor, -1)); - } - } - } - - private class ADDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - gb.AD(decoder.decodeIntArray(typeDescriptor, numElements)); - } - } - } - - private class PLDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - gb.PL(decoder.decodeIntArray(typeDescriptor, numElements)); - } - } - } - - private class GenericDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - if ( value != null ) { // don't add missing values - if ( value instanceof List && ((List)value).size() == 1) { - // todo -- I really hate this, and it suggests that the code isn't completely right - // the reason it's here is that it's possible to prune down a vector to a singleton - // value and there we have the contract that the value comes back as an atomic value - // not a vector of size 1 - value = ((List)value).get(0); - } - gb.attribute(field, value); - } - } - } - } - - private class FTDecoder implements Decoder { - @Override - public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException { - for ( final GenotypeBuilder gb : gbs ) { - Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - assert value == null || value instanceof String; - gb.filter((String)value); - } - } - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2LazyGenotypesDecoder.java deleted file mode 100644 index ffbfe81e6..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2LazyGenotypesDecoder.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Requires; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.IOException; -import java.util.*; - -/** - * Lazy version of genotypes decoder for BCF2 genotypes - * - * @author Mark DePristo - * @since 5/12 - */ -public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { - // the essential information for us to use to decode the genotypes data - // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec - // and its stored here again for code cleanliness - private final BCF2Codec codec; - private final List siteAlleles; - private final int nSamples; - private final int nFields; - private final GenotypeBuilder[] builders; - - @Requires("codec.getHeader().getNGenotypeSamples() == builders.length") - BCF2LazyGenotypesDecoder(final BCF2Codec codec, final List alleles, final int nSamples, - final int nFields, final GenotypeBuilder[] builders) { - this.codec = codec; - this.siteAlleles = alleles; - this.nSamples = nSamples; - this.nFields = nFields; - this.builders = builders; - } - - @Override - public LazyGenotypesContext.LazyData parse(final Object data) { - try { - - // load our byte[] data into the decoder - final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes); - - for ( int i = 0; i < nSamples; i++ ) - builders[i].reset(true); - - for ( int i = 0; i < nFields; i++ ) { - // get the field name - final int offset = (Integer) decoder.decodeTypedValue(); - final String field = codec.getDictionaryString(offset); - - // the type of each element - final byte typeDescriptor = decoder.readTypeDescriptor(); - final int numElements = decoder.decodeNumberOfElements(typeDescriptor); - final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field); - try { - fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, numElements, builders); - } catch ( ClassCastException e ) { - throw new TribbleException("BUG: expected encoding of field " + field - + " inconsistent with the value observed in the decoded value"); - } - } - - final ArrayList genotypes = new ArrayList(nSamples); - for ( final GenotypeBuilder gb : builders ) - genotypes.add(gb.make()); - - return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); - } catch ( IOException e ) { - throw new TribbleException("Unexpected IOException parsing already read genotypes data block", e); - } - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Type.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2Type.java deleted file mode 100644 index 4504b8d75..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Type.java +++ /dev/null @@ -1,219 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Requires; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.EnumSet; - -/** - * BCF2 types and associated information - * - * @author depristo - * @since 05/12 - */ -public enum BCF2Type { - // the actual values themselves - MISSING(0, 0, 0x00) { - @Override public int read(final InputStream in) throws IOException { - throw new IllegalArgumentException("Cannot read MISSING type"); - } - @Override public void write(final int value, final OutputStream out) throws IOException { - throw new IllegalArgumentException("Cannot write MISSING type"); - } - }, - - INT8 (1, 1, 0xFFFFFF80, -127, 127) { - @Override - public int read(final InputStream in) throws IOException { - return BCF2Utils.readByte(in); - } - - @Override - public void write(final int value, final OutputStream out) throws IOException { - out.write(0xFF & value); // TODO -- do we need this operation? - } - }, - - INT16(2, 2, 0xFFFF8000, -32767, 32767) { - @Override - public int read(final InputStream in) throws IOException { - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (short)((b1 << 8) | b2); - } - - @Override - public void write(final int value, final OutputStream out) throws IOException { - // TODO -- optimization -- should we put this in a local buffer? - out.write((0x00FF & value)); - out.write((0xFF00 & value) >> 8); - } - }, - - INT32(3, 4, 0x80000000, -2147483647, 2147483647) { - @Override - public int read(final InputStream in) throws IOException { - final int b4 = BCF2Utils.readByte(in) & 0xFF; - final int b3 = BCF2Utils.readByte(in) & 0xFF; - final int b2 = BCF2Utils.readByte(in) & 0xFF; - final int b1 = BCF2Utils.readByte(in) & 0xFF; - return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4); - } - - @Override - public void write(final int value, final OutputStream out) throws IOException { - out.write((0x000000FF & value)); - out.write((0x0000FF00 & value) >> 8); - out.write((0x00FF0000 & value) >> 16); - out.write((0xFF000000 & value) >> 24); - } - }, - - FLOAT(5, 4, 0x7F800001) { - @Override - public int read(final InputStream in) throws IOException { - return INT32.read(in); - } - - @Override - public void write(final int value, final OutputStream out) throws IOException { - INT32.write(value, out); - } - }, - - CHAR (7, 1, 0x00000000) { - @Override - public int read(final InputStream in) throws IOException { - return INT8.read(in); - } - - @Override - public void write(final int value, final OutputStream out) throws IOException { - INT8.write(value, out); - } - }; - - private final int id; - private final Object missingJavaValue; - private final int missingBytes; - private final int sizeInBytes; - private final long minValue, maxValue; - - BCF2Type(final int id, final int sizeInBytes, final int missingBytes) { - this(id, sizeInBytes, missingBytes, 0, 0); - } - - BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) { - this.id = id; - this.sizeInBytes = sizeInBytes; - this.missingJavaValue = null; - this.missingBytes = missingBytes; - this.minValue = minValue; - this.maxValue = maxValue; - } - - /** - * How many bytes are used to represent this type on disk? - * @return - */ - public int getSizeInBytes() { - return sizeInBytes; - } - - /** - * The ID according to the BCF2 specification - * @return - */ - public int getID() { return id; } - - /** - * Can we encode value v in this type, according to its declared range. - * - * Only makes sense for integer values - * - * @param v - * @return - */ - @Requires("this.isIntegerType()") - public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } - - /** - * Return the java object (aka null) that is used to represent a missing value for this - * type in Java - * - * @return - */ - public Object getMissingJavaValue() { return missingJavaValue; } - - /** - * The bytes (encoded as an int) that are used to represent a missing value - * for this type in BCF2 - * - * @return - */ - public int getMissingBytes() { return missingBytes; } - - /** - * An enum set of the types that might represent Integer values - */ - private final static EnumSet INTEGERS = EnumSet.of(INT8, INT16, INT32); - - /** - * @return true if this BCF2Type corresponds to the magic "MISSING" type (0x00) - */ - public boolean isMissingType() { - return this == MISSING; - } - - public boolean isIntegerType() { - return INTEGERS.contains(this); - } - - /** - * Read a value from in stream of this BCF2 type as an int [32 bit] collection of bits - * - * For intX and char values this is just the int / byte value of the underlying data represented as a 32 bit int - * For a char the result must be converted to a char by (char)(byte)(0x0F & value) - * For doubles it's necessary to convert subsequently this value to a double via Double.bitsToDouble() - * - * @param in - * @return - * @throws IOException - */ - @Requires("in != null") - public int read(final InputStream in) throws IOException { - throw new IllegalArgumentException("Not implemented"); - } - - @Requires("out != null") - public void write(final int value, final OutputStream out) throws IOException { - throw new IllegalArgumentException("Not implemented"); - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/variant/bcf2/BCF2Utils.java deleted file mode 100644 index 0b16fd52b..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCF2Utils.java +++ /dev/null @@ -1,333 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.vcf.*; - -import java.io.*; -import java.util.*; - -/** - * Common utilities for working with BCF2 files - * - * Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type) - * - * @author depristo - * @since 5/12 - */ -public final class BCF2Utils { - public static final int MAX_ALLELES_IN_GENOTYPES = 127; - - public static final int OVERFLOW_ELEMENT_MARKER = 15; - public static final int MAX_INLINE_ELEMENTS = 14; - - public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32}; - public final static BCF2Type[] ID_TO_ENUM; - - static { - int maxID = -1; - for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID); - ID_TO_ENUM = new BCF2Type[maxID+1]; - for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v; - } - - private BCF2Utils() {} - - /** - * Create a strings dictionary from the VCF header - * - * The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT) - * fields. - * - * Note that its critical that the list be dedupped and sorted in a consistent manner each time, - * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly - * the same way as in the header each time it's very bad - * - * @param header the VCFHeader from which to build the dictionary - * @return a non-null dictionary of elements, may be empty - */ - @Requires("header != null") - @Ensures({"result != null", "new HashSet(result).size() == result.size()"}) - public static ArrayList makeDictionary(final VCFHeader header) { - final Set seen = new HashSet(); - final ArrayList dict = new ArrayList(); - - // special case the special PASS field which doesn't show up in the FILTER field definitions - seen.add(VCFConstants.PASSES_FILTERS_v4); - dict.add(VCFConstants.PASSES_FILTERS_v4); - - // set up the strings dictionary - for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) { - final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - if ( ! seen.contains(idLine.getID())) { - dict.add(idLine.getID()); - seen.add(idLine.getID()); - } - } - } - - return dict; - } - - @Requires({"nElements >= 0", "nElements <= OVERFLOW_ELEMENT_MARKER", "type != null"}) - public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { - return (byte)((0x0F & nElements) << 4 | (type.getID() & 0x0F)); - } - - @Ensures("result >= 0") - public static int decodeSize(final byte typeDescriptor) { - return (0xF0 & typeDescriptor) >> 4; - } - - @Ensures("result >= 0") - public static int decodeTypeID(final byte typeDescriptor) { - return typeDescriptor & 0x0F; - } - - @Ensures("result != null") - public static BCF2Type decodeType(final byte typeDescriptor) { - return ID_TO_ENUM[decodeTypeID(typeDescriptor)]; - } - - public static boolean sizeIsOverflow(final byte typeDescriptor) { - return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; - } - - public static byte readByte(final InputStream stream) throws IOException { - return (byte)(stream.read() & 0xFF); - } - - /** - * Collapse multiple strings into a comma separated list - * - * ["s1", "s2", "s3"] => ",s1,s2,s3" - * - * @param strings size > 1 list of strings - * @return - */ - @Requires({"strings != null"}) - @Ensures("result != null") - public static String collapseStringList(final List strings) { - if ( strings.isEmpty() ) return ""; - else if ( strings.size() == 1 ) return strings.get(0); - else { - final StringBuilder b = new StringBuilder(); - for ( final String s : strings ) { - if ( s != null ) { - assert s.indexOf(",") == -1; // no commas in individual strings - b.append(",").append(s); - } - } - return b.toString(); - } - } - - /** - * Inverse operation of collapseStringList. - * - * ",s1,s2,s3" => ["s1", "s2", "s3"] - * - * - * @param collapsed - * @return - */ - @Requires({"collapsed != null", "isCollapsedString(collapsed)"}) - @Ensures("result != null") - public static List explodeStringList(final String collapsed) { - assert isCollapsedString(collapsed); - final String[] exploded = collapsed.substring(1).split(","); - return Arrays.asList(exploded); - } - - @Requires("s != null") - public static boolean isCollapsedString(final String s) { - return s.length() > 0 && s.charAt(0) == ','; - } - - /** - * Returns a good name for a shadow BCF file for vcfFile. - * - * foo.vcf => foo.bcf - * foo.xxx => foo.xxx.bcf - * - * If the resulting BCF file cannot be written, return null. Happens - * when vcfFile = /dev/null for example - * - * @param vcfFile - * @return the BCF - */ - @Requires("vcfFile != null") - public static final File shadowBCF(final File vcfFile) { - final String path = vcfFile.getAbsolutePath(); - if ( path.contains(".vcf") ) - return new File(path.replace(".vcf", ".bcf")); - else { - final File bcf = new File( path + ".bcf" ); - if ( bcf.canRead() ) - return bcf; - else { - try { - // this is the only way to robustly decide if we could actually write to BCF - final FileOutputStream o = new FileOutputStream(bcf); - o.close(); - bcf.delete(); - return bcf; - } catch ( FileNotFoundException e ) { - return null; - } catch ( IOException e ) { - return null; - } - } - } - } - - @Ensures("result.isIntegerType()") - public static BCF2Type determineIntegerType(final int value) { - for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { - if ( potentialType.withinRange(value) ) - return potentialType; - } - - throw new TribbleException("Integer cannot be encoded in allowable range of even INT32: " + value); - } - - @Ensures("result.isIntegerType()") - public static BCF2Type determineIntegerType(final int[] values) { - // find the min and max values in the array - int max = 0, min = 0; - for ( final int v : values ) { - if ( v > max ) max = v; - if ( v < min ) min = v; - } - - final BCF2Type maxType = determineIntegerType(max); - final BCF2Type minType = determineIntegerType(min); - - // INT8 < INT16 < INT32 so this returns the larger of the two - return maxType.compareTo(minType) >= 0 ? maxType : minType; - } - - /** - * Returns the maximum BCF2 integer size of t1 and t2 - * - * For example, if t1 == INT8 and t2 == INT16 returns INT16 - * - * @param t1 - * @param t2 - * @return - */ - @Requires({"t1.isIntegerType()","t2.isIntegerType()"}) - @Ensures("result.isIntegerType()") - public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { - switch ( t1 ) { - case INT8: return t2; - case INT16: return t2 == BCF2Type.INT32 ? t2 : t1; - case INT32: return t1; - default: throw new TribbleException("BUG: unexpected BCF2Type " + t1); - } - } - - @Ensures("result.isIntegerType()") - public static BCF2Type determineIntegerType(final List values) { - BCF2Type maxType = BCF2Type.INT8; - for ( final int value : values ) { - final BCF2Type type1 = determineIntegerType(value); - switch ( type1 ) { - case INT8: break; - case INT16: maxType = BCF2Type.INT16; break; - case INT32: return BCF2Type.INT32; // fast path for largest possible value - default: throw new TribbleException("Unexpected integer type " + type1 ); - } - } - return maxType; - } - - /** - * Helper function that takes an object and returns a list representation - * of it: - * - * o == null => [] - * o is a list => o - * else => [o] - * - * @param o - * @return - */ - public static List toList(final Object o) { - if ( o == null ) return Collections.emptyList(); - else if ( o instanceof List ) return (List)o; - else return Collections.singletonList(o); - } - - /** - * Are the elements and their order in the output and input headers consistent so that - * we can write out the raw genotypes block without decoding and recoding it? - * - * If the order of INFO, FILTER, or contrig elements in the output header is different than - * in the input header we must decode the blocks using the input header and then recode them - * based on the new output order. - * - * If they are consistent, we can simply pass through the raw genotypes block bytes, which is - * a *huge* performance win for large blocks. - * - * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc) - * don't modify the ordering of the header fields and so can safely pass through the genotypes - * undecoded. Some operations -- those at add filters or info fields -- can change the ordering - * of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded - */ - public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) { - // first, we have to have the same samples in the same order - if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) - return false; - - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); - - while ( inputLinesIt.hasNext() ) { - if ( ! outputLinesIt.hasNext() ) // missing lines in output - return false; - - final VCFIDHeaderLine outputLine = outputLinesIt.next(); - final VCFIDHeaderLine inputLine = inputLinesIt.next(); - - if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) ) - return false; - } - - return true; - } - - private static List nullAsEmpty(List l) { - if ( l == null ) - return Collections.emptyList(); - else - return l; - } -} diff --git a/public/java/src/org/broadinstitute/variant/bcf2/BCFVersion.java b/public/java/src/org/broadinstitute/variant/bcf2/BCFVersion.java deleted file mode 100644 index dcb2d60d8..000000000 --- a/public/java/src/org/broadinstitute/variant/bcf2/BCFVersion.java +++ /dev/null @@ -1,105 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Arrays; - -/** - * Simple holder for BCF version information - * - * User: depristo - * Date: 8/2/12 - * Time: 2:16 PM - */ -public class BCFVersion { - /** - * BCF2 begins with the MAGIC info BCF_M_m where M is the major version (currently 2) - * and m is the minor version, currently 1 - */ - public static final byte[] MAGIC_HEADER_START = "BCF".getBytes(); - - final int majorVersion; - final int minorVersion; - - public BCFVersion(int majorVersion, int minorVersion) { - this.majorVersion = majorVersion; - this.minorVersion = minorVersion; - } - - /** - * @return the major version number of this BCF file - */ - public int getMajorVersion() { - return majorVersion; - } - - /** - * @return the minor version number of this BCF file - */ - public int getMinorVersion() { - return minorVersion; - } - - /** - * Return a new BCFVersion object describing the major and minor version of the BCF file in stream - * - * Note that stream must be at the very start of the file. - * - * @param stream - * @return a BCFVersion object, or null if stream doesn't contain a BCF file - * @throws IOException - */ - public static BCFVersion readBCFVersion(final InputStream stream) throws IOException { - final byte[] magicBytes = new byte[MAGIC_HEADER_START.length]; - stream.read(magicBytes); - if ( Arrays.equals(magicBytes, MAGIC_HEADER_START) ) { - // we're a BCF file - final int majorByte = stream.read(); - final int minorByte = stream.read(); - return new BCFVersion( majorByte, minorByte ); - } else - return null; - } - - /** - * Write out the BCF magic information indicating this is a BCF file with corresponding major and minor versions - * @param out - * @throws IOException - */ - public void write(final OutputStream out) throws IOException { - out.write(MAGIC_HEADER_START); - out.write(getMajorVersion() & 0xFF); - out.write(getMinorVersion() & 0xFF); - } - - @Override - public String toString() { - return String.format("BCF%d.%d", getMajorVersion(), getMinorVersion()); - } -} diff --git a/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java b/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java deleted file mode 100644 index 2dbc865b5..000000000 --- a/public/java/src/org/broadinstitute/variant/utils/GeneralUtils.java +++ /dev/null @@ -1,242 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.utils; - -import java.util.*; - -/** - * Constants and utility methods used throughout the VCF/BCF/VariantContext classes - */ -public class GeneralUtils { - - /** - * Setting this to true causes the VCF/BCF/VariantContext classes to emit debugging information - * to standard error - */ - public static final boolean DEBUG_MODE_ENABLED = false; - - /** - * The smallest log10 value we'll emit from normalizeFromLog10 and other functions - * where the real-space value is 0.0. - */ - public final static double LOG10_P_OF_ZERO = -1000000.0; - - /** - * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of - * elti objects (note there's no actual space between sep and the elti elements). Returns - * "" if collection is empty. If collection contains just elt, then returns elt.toString() - * - * @param separator the string to use to separate objects - * @param objects a collection of objects. the element order is defined by the iterator over objects - * @param the type of the objects - * @return a non-null string - */ - public static String join(final String separator, final Collection objects) { - if (objects.isEmpty()) { // fast path for empty collection - return ""; - } else { - final Iterator iter = objects.iterator(); - final T first = iter.next(); - - if ( ! iter.hasNext() ) // fast path for singleton collections - return first.toString(); - else { // full path for 2+ collection that actually need a join - final StringBuilder ret = new StringBuilder(first.toString()); - while(iter.hasNext()) { - ret.append(separator); - ret.append(iter.next().toString()); - } - return ret.toString(); - } - } - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - public static double[] normalizeFromLog10(double[] array) { - return normalizeFromLog10(array, false); - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @param takeLog10OfOutput if true, the output will be transformed back into log10 units - * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed - */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) { - return normalizeFromLog10(array, takeLog10OfOutput, false); - } - - /** - * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space - * - * @param array - * @param takeLog10OfOutput - * @param keepInLogSpace - * - * @return - */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = arrayMax(array); - - // we may decide to just normalize in log space without converting to linear space - if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) { - array[i] -= maxValue; - } - return array; - } - - // default case: go to linear space - double[] normalized = new double[array.length]; - - for (int i = 0; i < array.length; i++) - normalized[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += normalized[i]; - for (int i = 0; i < array.length; i++) { - double x = normalized[i] / sum; - if (takeLog10OfOutput) { - x = Math.log10(x); - if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) - x = array[i] - maxValue; - } - - normalized[i] = x; - } - - return normalized; - } - - public static double arrayMax(final double[] array) { - return array[maxElementIndex(array, array.length)]; - } - - public static int maxElementIndex(final double[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final double[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static List cons(final T elt, final List l) { - List l2 = new ArrayList(); - l2.add(elt); - if (l != null) l2.addAll(l); - return l2; - } - - /** - * Make all combinations of N size of objects - * - * if objects = [A, B, C] - * if N = 1 => [[A], [B], [C]] - * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] - * - * @param objects - * @param n - * @param - * @param withReplacement if false, the resulting permutations will only contain unique objects from objects - * @return - */ - public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { - final List> combinations = new ArrayList>(); - - if ( n <= 0 ) - ; - else if ( n == 1 ) { - for ( final T o : objects ) - combinations.add(Collections.singletonList(o)); - } else { - final List> sub = makePermutations(objects, n - 1, withReplacement); - for ( List subI : sub ) { - for ( final T a : objects ) { - if ( withReplacement || ! subI.contains(a) ) - combinations.add(cons(a, subI)); - } - } - } - - return combinations; - } - - /** - * Compares double values for equality (within 1e-6), or inequality. - * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. - */ - public static byte compareDoubles(double a, double b) { - return compareDoubles(a, b, 1e-6); - } - - /** - * Compares double values for equality (within epsilon), or inequality. - * - * @param a the first double value - * @param b the second double value - * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. - */ - public static byte compareDoubles(double a, double b, double epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - static public final List reverse(final List l) { - final List newL = new ArrayList(l); - Collections.reverse(newL); - return newL; - } -} - - diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java deleted file mode 100644 index e0a6495a5..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ /dev/null @@ -1,476 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import net.sf.samtools.util.StringUtil; - -import java.util.Arrays; -import java.util.Collection; - -/** - * Immutable representation of an allele - * - * Types of alleles: - * - * Ref: a t C g a // C is the reference base - * - * : a t G g a // C base is a G in some individuals - * - * : a t - g a // C base is deleted w.r.t. the reference - * - * : a t CAg a // A base is inserted w.r.t. the reference sequence - * - * In these cases, where are the alleles? - * - * SNP polymorphism of C/G -> { C , G } -> C is the reference allele - * 1 base deletion of C -> { C , - } -> C is the reference allele - * 1 base insertion of A -> { - ; A } -> Null is the reference allele - * - * Suppose I see a the following in the population: - * - * Ref: a t C g a // C is the reference base - * : a t G g a // C base is a G in some individuals - * : a t - g a // C base is deleted w.r.t. the reference - * - * How do I represent this? There are three segregating alleles: - * - * { C , G , - } - * - * Now suppose I have this more complex example: - * - * Ref: a t C g a // C is the reference base - * : a t - g a - * : a t - - a - * : a t CAg a - * - * There are actually four segregating alleles: - * - * { C g , - g, - -, and CAg } over bases 2-4 - * - * However, the molecular equivalence explicitly listed above is usually discarded, so the actual - * segregating alleles are: - * - * { C g, g, -, C a g } - * - * Critically, it should be possible to apply an allele to a reference sequence to create the - * correct haplotype sequence: - * - * Allele + reference => haplotype - * - * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the - * Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context. - * - * Given list of alleles it's possible to determine the "type" of the variation - * - * A / C @ loc => SNP with - * - / A => INDEL - * - * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion. - * - * Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be - * determined. This is usually represented by a '.' allele. - * - * Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an - * Allele. - - * @author ebanks, depristo - */ -public class Allele implements Comparable { - private static final byte[] EMPTY_ALLELE_BASES = new byte[0]; - - private boolean isRef = false; - private boolean isNoCall = false; - private boolean isSymbolic = false; - - private byte[] bases = null; - - public final static String NO_CALL_STRING = "."; - /** A generic static NO_CALL allele for use */ - - // no public way to create an allele - protected Allele(byte[] bases, boolean isRef) { - // null alleles are no longer allowed - if ( wouldBeNullAllele(bases) ) { - throw new IllegalArgumentException("Null alleles are not supported"); - } - - // no-calls are represented as no bases - if ( wouldBeNoCallAllele(bases) ) { - this.bases = EMPTY_ALLELE_BASES; - isNoCall = true; - if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); - return; - } - - if ( wouldBeSymbolicAllele(bases) ) { - isSymbolic = true; - if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); - } - else { - StringUtil.toUpperCase(bases); - } - - this.isRef = isRef; - this.bases = bases; - - if ( ! acceptableAlleleBases(bases) ) - throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); - } - - protected Allele(String bases, boolean isRef) { - this(bases.getBytes(), isRef); - } - - - private final static Allele REF_A = new Allele("A", true); - private final static Allele ALT_A = new Allele("A", false); - private final static Allele REF_C = new Allele("C", true); - private final static Allele ALT_C = new Allele("C", false); - private final static Allele REF_G = new Allele("G", true); - private final static Allele ALT_G = new Allele("G", false); - private final static Allele REF_T = new Allele("T", true); - private final static Allele ALT_T = new Allele("T", false); - private final static Allele REF_N = new Allele("N", true); - private final static Allele ALT_N = new Allele("N", false); - public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false); - - // --------------------------------------------------------------------------------------------------------- - // - // creation routines - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases - * == '-', a Null allele is created. If bases == '.', a no call Allele is created. - * - * @param bases the DNA sequence of this variation, '-', of '.' - * @param isRef should we make this a reference allele? - * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated - */ - public static Allele create(byte[] bases, boolean isRef) { - if ( bases == null ) - throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); - - if ( bases.length == 1 ) { - // optimization to return a static constant Allele for each single base object - switch (bases[0]) { - case '.': - if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); - return NO_CALL; - case 'A': case 'a' : return isRef ? REF_A : ALT_A; - case 'C': case 'c' : return isRef ? REF_C : ALT_C; - case 'G': case 'g' : return isRef ? REF_G : ALT_G; - case 'T': case 't' : return isRef ? REF_T : ALT_T; - case 'N': case 'n' : return isRef ? REF_N : ALT_N; - default: throw new IllegalArgumentException("Illegal base [" + (char)bases[0] + "] seen in the allele"); - } - } else { - return new Allele(bases, isRef); - } - } - - public static Allele create(byte base, boolean isRef) { -// public Allele(byte base, boolean isRef) { - return create( new byte[]{ base }, isRef); - } - - public static Allele create(byte base) { - return create( base, false ); - } - - public static Allele extend(Allele left, byte[] right) { - if (left.isSymbolic()) - throw new IllegalArgumentException("Cannot extend a symbolic allele"); - byte[] bases = new byte[left.length() + right.length]; - System.arraycopy(left.getBases(), 0, bases, 0, left.length()); - System.arraycopy(right, 0, bases, left.length(), right.length); - - return create(bases, left.isReference()); - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the null allele - */ - public static boolean wouldBeNullAllele(byte[] bases) { - return (bases.length == 1 && bases[0] == '-') || bases.length == 0; - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the NO_CALL allele - */ - public static boolean wouldBeNoCallAllele(byte[] bases) { - return bases.length == 1 && bases[0] == '.'; - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent a symbolic allele - */ - public static boolean wouldBeSymbolicAllele(byte[] bases) { - if ( bases.length <= 2 ) - return false; - else { - final String strBases = new String(bases); - return (bases[0] == '<' && bases[bases.length-1] == '>') || - (strBases.contains("[") || strBases.contains("]")); - } - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the well formatted allele - */ - public static boolean acceptableAlleleBases(String bases) { - return acceptableAlleleBases(bases.getBytes(), true); - } - - public static boolean acceptableAlleleBases(String bases, boolean allowNsAsAcceptable) { - return acceptableAlleleBases(bases.getBytes(), allowNsAsAcceptable); - } - - /** - * @param bases bases representing an allele - * @return true if the bases represent the well formatted allele - */ - public static boolean acceptableAlleleBases(byte[] bases) { - return acceptableAlleleBases(bases, true); // default: N bases are acceptable - } - - public static boolean acceptableAlleleBases(byte[] bases, boolean allowNsAsAcceptable) { - if ( wouldBeNullAllele(bases) ) - return false; - - if ( wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) ) - return true; - - for (byte base : bases ) { - switch (base) { - case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't': - break; - case 'N' : case 'n' : - if (allowNsAsAcceptable) - break; - else - return false; - default: - return false; - } - } - - return true; - } - - /** - * @see Allele(byte[], boolean) - * - * @param bases bases representing an allele - * @param isRef is this the reference allele? - */ - public static Allele create(String bases, boolean isRef) { - //public Allele(String bases, boolean isRef) { - return create(bases.getBytes(), isRef); - } - - - /** - * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information - * - * @param bases bases representing an allele - */ - public static Allele create(String bases) { - return create(bases, false); - } - - /** - * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information - * - * @param bases bases representing an allele - */ - public static Allele create(byte[] bases) { - return create(bases, false); - //this(bases, false); - } - - // --------------------------------------------------------------------------------------------------------- - // - // accessor routines - // - // --------------------------------------------------------------------------------------------------------- - - // Returns true if this is the NO_CALL allele - public boolean isNoCall() { return isNoCall; } - // Returns true if this is not the NO_CALL allele - public boolean isCalled() { return ! isNoCall(); } - - // Returns true if this Allele is the reference allele - public boolean isReference() { return isRef; } - // Returns true if this Allele is not the reference allele - public boolean isNonReference() { return ! isReference(); } - - // Returns true if this Allele is symbolic (i.e. no well-defined base sequence) - public boolean isSymbolic() { return isSymbolic; } - - // Returns a nice string representation of this object - public String toString() { - return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : ""); - } - - /** - * Return the DNA bases segregating in this allele. Note this isn't reference polarized, - * so the Null allele is represented by a vector of length 0 - * - * @return the segregating bases - */ - public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; } - - /** - * Return the DNA bases segregating in this allele in String format. - * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it. - * - * @return the segregating bases - */ - public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); } - - /** - * Return the printed representation of this allele. - * Same as getBaseString(), except for symbolic alleles. - * For symbolic alleles, the base string is empty while the display string contains . - * - * @return the allele string representation - */ - public String getDisplayString() { return new String(bases); } - - /** - * Same as #getDisplayString() but returns the result as byte[]. - * - * Slightly faster then getDisplayString() - * - * @return the allele string representation - */ - public byte[] getDisplayBases() { return bases; } - - /** - * @param other the other allele - * - * @return true if these alleles are equal - */ - public boolean equals(Object other) { - return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) ); - } - - /** - * @return hash code - */ - public int hashCode() { - int hash = 1; - for (int i = 0; i < bases.length; i++) - hash += (i+1) * bases[i]; - return hash; - } - - /** - * Returns true if this and other are equal. If ignoreRefState is true, then doesn't require both alleles has the - * same ref tag - * - * @param other allele to compare to - * @param ignoreRefState if true, ignore ref state in comparison - * @return true if this and other are equal - */ - public boolean equals(Allele other, boolean ignoreRefState) { - return this == other || (isRef == other.isRef || ignoreRefState) && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases)); - } - - /** - * @param test bases to test against - * - * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); } - - /** - * @param test bases to test against - * - * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); } - - /** - * @param test allele to test against - * - * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles - */ - public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); } - - /** - * @return the length of this allele. Null and NO_CALL alleles have 0 length. - */ - public int length() { - return isSymbolic ? 0 : bases.length; - } - - // --------------------------------------------------------------------------------------------------------- - // - // useful static functions - // - // --------------------------------------------------------------------------------------------------------- - - public static Allele getMatchingAllele(Collection allAlleles, byte[] alleleBases) { - for ( Allele a : allAlleles ) { - if ( a.basesMatch(alleleBases) ) { - return a; - } - } - - if ( wouldBeNoCallAllele(alleleBases) ) - return NO_CALL; - else - return null; // couldn't find anything - } - - public int compareTo(Allele other) { - if ( isReference() && other.isNonReference() ) - return -1; - else if ( isNonReference() && other.isReference() ) - return 1; - else - return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue - } - - public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) { - if ( a2.length() >= a1.length() ) - return firstIsPrefixOfSecond(a1, a2); - else - return firstIsPrefixOfSecond(a2, a1); - } - - private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) { - String a1String = a1.getBaseString(); - return a2.getBaseString().substring(0, a1String.length()).equals(a1String); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java b/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java deleted file mode 100644 index 16fa52ee0..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/CommonInfo.java +++ /dev/null @@ -1,263 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.*; - - -/** - * Common utility routines for VariantContext and Genotype - * - * @author depristo - */ -public final class CommonInfo { - public static final double NO_LOG10_PERROR = 1.0; - - private static Set NO_FILTERS = Collections.emptySet(); - private static Map NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap()); - - private double log10PError = NO_LOG10_PERROR; - private String name = null; - private Set filters = null; - private Map attributes = NO_ATTRIBUTES; - - public CommonInfo(String name, double log10PError, Set filters, Map attributes) { - this.name = name; - setLog10PError(log10PError); - this.filters = filters; - if ( attributes != null && ! attributes.isEmpty() ) { - this.attributes = attributes; - } - } - - /** - * @return the name - */ - public String getName() { - return name; - } - - /** - * Sets the name - * - * @param name the name associated with this information - */ - public void setName(String name) { - if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this); - this.name = name; - } - - - // --------------------------------------------------------------------------------------------------------- - // - // Filter - // - // --------------------------------------------------------------------------------------------------------- - - public Set getFiltersMaybeNull() { - return filters; - } - - public Set getFilters() { - return filters == null ? NO_FILTERS : Collections.unmodifiableSet(filters); - } - - public boolean filtersWereApplied() { - return filters != null; - } - - public boolean isFiltered() { - return filters == null ? false : filters.size() > 0; - } - - public boolean isNotFiltered() { - return ! isFiltered(); - } - - public void addFilter(String filter) { - if ( filters == null ) // immutable -> mutable - filters = new HashSet(); - - if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this); - if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this); - filters.add(filter); - } - - public void addFilters(Collection filters) { - if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this); - for ( String f : filters ) - addFilter(f); - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with log error rates - // - // --------------------------------------------------------------------------------------------------------- - - public boolean hasLog10PError() { - return getLog10PError() != NO_LOG10_PERROR; - } - - /** - * @return the -1 * log10-based error estimate - */ - public double getLog10PError() { return log10PError; } - public double getPhredScaledQual() { return getLog10PError() * -10; } - - public void setLog10PError(double log10PError) { - if ( log10PError > 0 && log10PError != NO_LOG10_PERROR) - throw new IllegalArgumentException("BUG: log10PError cannot be > 0 : " + this.log10PError); - if ( Double.isInfinite(this.log10PError) ) - throw new IllegalArgumentException("BUG: log10PError should not be Infinity"); - if ( Double.isNaN(this.log10PError) ) - throw new IllegalArgumentException("BUG: log10PError should not be NaN"); - this.log10PError = log10PError; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with attributes - // - // --------------------------------------------------------------------------------------------------------- - public void clearAttributes() { - attributes = new HashMap(); - } - - /** - * @return the attribute map - */ - public Map getAttributes() { - return Collections.unmodifiableMap(attributes); - } - - // todo -- define common attributes as enum - - public void setAttributes(Map map) { - clearAttributes(); - putAttributes(map); - } - - public void putAttribute(String key, Object value) { - putAttribute(key, value, false); - } - - public void putAttribute(String key, Object value, boolean allowOverwrites) { - if ( ! allowOverwrites && hasAttribute(key) ) - throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this); - - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - - attributes.put(key, value); - } - - public void removeAttribute(String key) { - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - attributes.remove(key); - } - - public void putAttributes(Map map) { - if ( map != null ) { - // for efficiency, we can skip the validation if the map is empty - if ( attributes.size() == 0 ) { - if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable - attributes = new HashMap(); - attributes.putAll(map); - } else { - for ( Map.Entry elt : map.entrySet() ) { - putAttribute(elt.getKey(), elt.getValue(), false); - } - } - } - } - - public boolean hasAttribute(String key) { - return attributes.containsKey(key); - } - - public int getNumAttributes() { - return attributes.size(); - } - - /** - * @param key the attribute key - * - * @return the attribute value for the given key (or null if not set) - */ - public Object getAttribute(String key) { - return attributes.get(key); - } - - public Object getAttribute(String key, Object defaultValue) { - if ( hasAttribute(key) ) - return attributes.get(key); - else - return defaultValue; - } - - public String getAttributeAsString(String key, String defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof String ) return (String)x; - return String.valueOf(x); // throws an exception if this isn't a string - } - - public int getAttributeAsInt(String key, int defaultValue) { - Object x = getAttribute(key); - if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; - if ( x instanceof Integer ) return (Integer)x; - return Integer.valueOf((String)x); // throws an exception if this isn't a string - } - - public double getAttributeAsDouble(String key, double defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof Double ) return (Double)x; - if ( x instanceof Integer ) return (Integer)x; - return Double.valueOf((String)x); // throws an exception if this isn't a string - } - - public boolean getAttributeAsBoolean(String key, boolean defaultValue) { - Object x = getAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof Boolean ) return (Boolean)x; - return Boolean.valueOf((String)x); // throws an exception if this isn't a string - } - -// public String getAttributeAsString(String key) { return (String.valueOf(getExtendedAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" -// public int getAttributeAsInt(String key) { Object x = getExtendedAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } -// public double getAttributeAsDouble(String key) { Object x = getExtendedAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } -// public boolean getAttributeAsBoolean(String key) { Object x = getExtendedAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } -// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } -// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } -// public String getAttributeAsStringNoException(String key) { if (getExtendedAttribute(key) == null) return null; return getAttributeAsString(key); } -// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/FastGenotype.java b/public/java/src/org/broadinstitute/variant/variantcontext/FastGenotype.java deleted file mode 100644 index 2ed89147e..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/FastGenotype.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * This class encompasses all the basic information about a genotype. It is immutable. - * - * A genotype has several key fields - * - * -- a sample name, must be a non-null string - * - * -- an ordered list of alleles, intrepreted as the genotype of the sample, - * each allele for each chromosome given in order. If alleles = [a*, t] - * then the sample is a/t, with a (the reference from the *) the first - * chromosome and t on the second chromosome - * - * -- a isPhased marker indicting where the alleles are phased with respect to some global - * coordinate system. See VCF4.1 spec for a detailed discussion - * - * -- Inline, optimized ints and int[] values for: - * -- GQ: the phred-scaled genotype quality, of -1 if it's missing - * - * -- DP: the count of reads at this locus for this sample, of -1 if missing - * - * -- AD: an array of counts of reads at this locus, one for each Allele at the site. - * that is, for each allele in the surrounding VariantContext. Null if missing. - * - * -- PL: phred-scaled genotype likelihoods in standard VCF4.1 order for - * all combinations of the alleles in the surrounding VariantContext, given - * the ploidy of the sample (from the alleles vector). Null if missing. - * - * -- A general map from String keys to -> Object values for all other attributes in - * this genotype. Note that this map should not contain duplicate values for the - * standard bindings for GQ, DP, AD, and PL. Genotype filters can be put into - * this genotype, but it isn't respected by the GATK in analyses - * - * The only way to build a Genotype object is with a GenotypeBuilder, which permits values - * to be set in any order, which means that GenotypeBuilder may at some in the chain of - * sets pass through invalid states that are not permitted in a fully formed immutable - * Genotype. - * - * Note this is a simplified, refactored Genotype object based on the original - * generic (and slow) implementation from the original VariantContext + Genotype - * codebase. - * - * @author Mark DePristo - * @since 05/12 - */ -public final class FastGenotype extends Genotype { - private final List alleles; - private final boolean isPhased; - private final int GQ; - private final int DP; - private final int[] AD; - private final int[] PL; - private final Map extendedAttributes; - - /** - * The only way to make one of these, for use by GenotypeBuilder only - * - * @param sampleName - * @param alleles - * @param isPhased - * @param GQ - * @param DP - * @param AD - * @param PL - * @param extendedAttributes - */ - @Requires({ - "sampleName != null", - "alleles != null", - "GQ >= -1", - "DP >= -1", - "validADorPLField(AD)", - "validADorPLField(PL)", - "extendedAttributes != null", - "! hasForbiddenKey(extendedAttributes)"}) - protected FastGenotype(final String sampleName, - final List alleles, - final boolean isPhased, - final int GQ, - final int DP, - final int[] AD, - final int[] PL, - final String filters, - final Map extendedAttributes) { - super(sampleName, filters); - this.alleles = alleles; - this.isPhased = isPhased; - this.GQ = GQ; - this.DP = DP; - this.AD = AD; - this.PL = PL; - this.extendedAttributes = extendedAttributes; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Implmenting the abstract methods - // - // --------------------------------------------------------------------------------------------------------- - - @Override public List getAlleles() { - return alleles; - } - - @Override public Allele getAllele(int i) { - return alleles.get(i); - } - - @Override public boolean isPhased() { - return isPhased; - } - - @Override public int getDP() { - return DP; - } - - @Override public int[] getAD() { - return AD; - } - - @Override public int getGQ() { - return GQ; - } - - @Override public int[] getPL() { - return PL; - } - - // --------------------------------------------------------------------------------------------------------- - // - // get routines for extended attributes - // - // --------------------------------------------------------------------------------------------------------- - - public Map getExtendedAttributes() { - return extendedAttributes; - } - - /** - * Is values a valid AD or PL field - * @param values - * @return - */ - private static boolean validADorPLField(final int[] values) { - if ( values != null ) - for ( int v : values ) - if ( v < 0 ) - return false; - return true; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/variant/variantcontext/Genotype.java deleted file mode 100644 index 3695c39eb..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Genotype.java +++ /dev/null @@ -1,676 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.*; - -/** - * This class encompasses all the basic information about a genotype. It is immutable. - * - * @author Mark DePristo - */ -@Invariant({ - "getAlleles() != null", - "getSampleName() != null", - "getPloidy() >= 0", - "! hasForbiddenKey(getExtendedAttributes())"}) -public abstract class Genotype implements Comparable { - /** - * A list of genotype field keys corresponding to values we - * manage inline in the Genotype object. They must not appear in the - * extended attributes map - */ - public final static Collection PRIMARY_KEYS = Arrays.asList( - VCFConstants.GENOTYPE_FILTER_KEY, - VCFConstants.GENOTYPE_KEY, - VCFConstants.GENOTYPE_QUALITY_KEY, - VCFConstants.DEPTH_KEY, - VCFConstants.GENOTYPE_ALLELE_DEPTHS, - VCFConstants.GENOTYPE_PL_KEY); - - public final static String PHASED_ALLELE_SEPARATOR = "|"; - public final static String UNPHASED_ALLELE_SEPARATOR = "/"; - - private final String sampleName; - private GenotypeType type = null; - private final String filters; - - protected Genotype(final String sampleName, final String filters) { - this.sampleName = sampleName; - this.filters = filters; - } - - /** - * @return the alleles for this genotype. Cannot be null. May be empty - */ - @Ensures("result != null") - public abstract List getAlleles(); - - /** - * Returns how many times allele appears in this genotype object? - * - * @param allele - * @return a value >= 0 indicating how many times the allele occurred in this sample's genotype - */ - @Requires("allele != null") - @Ensures("result >= 0") - public int countAllele(final Allele allele) { - int c = 0; - for ( final Allele a : getAlleles() ) - if ( a.equals(allele) ) - c++; - - return c; - } - - /** - * Get the ith allele in this genotype - * - * @param i the ith allele, must be < the ploidy, starting with 0 - * @return the allele at position i, which cannot be null - */ - @Requires({"i >=0 && i < getPloidy()", "getType() != GenotypeType.UNAVAILABLE"}) - @Ensures("result != null") - public abstract Allele getAllele(int i); - - /** - * Are the alleles phased w.r.t. the global phasing system? - * - * @return true if yes - */ - public abstract boolean isPhased(); - - /** - * What is the ploidy of this sample? - * - * @return the ploidy of this genotype. 0 if the site is no-called. - */ - @Ensures("result >= 0") - public int getPloidy() { - return getAlleles().size(); - } - - /** - * @return the sequencing depth of this sample, or -1 if this value is missing - */ - @Ensures("result >= -1") - public abstract int getDP(); - - /** - * @return the count of reads, one for each allele in the surrounding Variant context, - * matching the corresponding allele, or null if this value is missing. MUST - * NOT BE MODIFIED! - */ - public abstract int[] getAD(); - - /** - * Returns the name associated with this sample. - * - * @return a non-null String - */ - @Ensures("result != null") - public String getSampleName() { - return sampleName; - } - - /** - * Returns a phred-scaled quality score, or -1 if none is available - * @return - */ - @Ensures("result >= -1") - public abstract int getGQ(); - - /** - * Does the PL field have a value? - * @return true if there's a PL field value - */ - @Ensures("(result == false && getPL() == null) || (result == true && getPL() != null)") - public boolean hasPL() { - return getPL() != null; - } - - /** - * Does the AD field have a value? - * @return true if there's a AD field value - */ - @Ensures("(result == false && getAD() == null) || (result == true && getAD() != null)") - public boolean hasAD() { - return getAD() != null; - } - - /** - * Does the GQ field have a value? - * @return true if there's a GQ field value - */ - @Ensures("(result == false && getGQ() == -1) || (result == true && getGQ() >= 0)") - public boolean hasGQ() { - return getGQ() != -1; - } - - /** - * Does the DP field have a value? - * @return true if there's a DP field value - */ - @Ensures("(result == false && getDP() == -1) || (result == true && getDP() >= 0)") - public boolean hasDP() { - return getDP() != -1; - } - - // --------------------------------------------------------------------------------------------------------- - // - // The type of this genotype - // - // --------------------------------------------------------------------------------------------------------- - - /** - * @return the high-level type of this sample's genotype - */ - @Ensures({"type != null", "result != null"}) - public GenotypeType getType() { - if ( type == null ) { - type = determineType(); - } - return type; - } - - /** - * Internal code to determine the type of the genotype from the alleles vector - * @return the type - */ - @Requires("type == null") // we should never call if already calculated - protected GenotypeType determineType() { - // TODO -- this code is slow and could be optimized for the diploid case - final List alleles = getAlleles(); - if ( alleles.isEmpty() ) - return GenotypeType.UNAVAILABLE; - - boolean sawNoCall = false, sawMultipleAlleles = false; - Allele observedAllele = null; - - for ( final Allele allele : alleles ) { - if ( allele.isNoCall() ) - sawNoCall = true; - else if ( observedAllele == null ) - observedAllele = allele; - else if ( !allele.equals(observedAllele) ) - sawMultipleAlleles = true; - } - - if ( sawNoCall ) { - if ( observedAllele == null ) - return GenotypeType.NO_CALL; - return GenotypeType.MIXED; - } - - if ( observedAllele == null ) - throw new IllegalStateException("BUG: there are no alleles present in this genotype but the alleles list is not null"); - - return sawMultipleAlleles ? GenotypeType.HET : observedAllele.isReference() ? GenotypeType.HOM_REF : GenotypeType.HOM_VAR; - } - - /** - * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false. - */ - public boolean isHom() { return isHomRef() || isHomVar(); } - - /** - * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false. - */ - public boolean isHomRef() { return getType() == GenotypeType.HOM_REF; } - - /** - * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false. - */ - public boolean isHomVar() { return getType() == GenotypeType.HOM_VAR; } - - /** - * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false. - */ - public boolean isHet() { return getType() == GenotypeType.HET; } - - /** - * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false. - */ - public boolean isNoCall() { return getType() == GenotypeType.NO_CALL; } - - /** - * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are). - */ - public boolean isCalled() { return getType() != GenotypeType.NO_CALL && getType() != GenotypeType.UNAVAILABLE; } - - /** - * @return true if this genotype is comprised of both calls and no-calls. - */ - public boolean isMixed() { return getType() == GenotypeType.MIXED; } - - /** - * @return true if the type of this genotype is set. - */ - public boolean isAvailable() { return getType() != GenotypeType.UNAVAILABLE; } - - // ------------------------------------------------------------------------------ - // - // methods for getting genotype likelihoods for a genotype object, if present - // - // ------------------------------------------------------------------------------ - - /** - * @return Returns true if this Genotype has PL field values - */ - @Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)") - public boolean hasLikelihoods() { - return getPL() != null; - } - - /** - * Convenience function that returns a string representation of the PL field of this - * genotype, or . if none is available. - * - * @return a non-null String representation for the PL of this sample - */ - @Ensures("result != null") - public String getLikelihoodsString() { - return hasLikelihoods() ? getLikelihoods().toString() : VCFConstants.MISSING_VALUE_v4; - } - - /** - * Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing - * @return null or a GenotypesLikelihood object for this sample's PL field - */ - @Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)") - public GenotypeLikelihoods getLikelihoods() { - return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; - } - - /** - * Are all likelihoods for this sample non-informative? - * - * Returns true if all PLs are 0 => 0,0,0 => true - * 0,0,0,0,0,0 => true - * 0,10,100 => false - * - * @return true if all samples PLs are equal and == 0 - */ - public boolean isNonInformative() { - if ( getPL() == null ) - return true; - else { - for ( final int PL : getPL() ) { - if ( PL != 0 ) - return false; - } - - return true; - } - } - - /** - * Unsafe low-level accessor the PL field itself, may be null. - * - * @return a pointer to the underlying PL data. MUST NOT BE MODIFIED! - */ - public abstract int[] getPL(); - - // --------------------------------------------------------------------------------------------------------- - // - // Many different string representations - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Return a VCF-like string representation for the alleles of this genotype. - * - * Does not append the reference * marker on the alleles. - * - * @return a string representing the genotypes, or null if the type is unavailable. - */ - @Ensures("result != null || ! isAvailable()") - public String getGenotypeString() { - return getGenotypeString(true); - } - - /** - * Return a VCF-like string representation for the alleles of this genotype. - * - * If ignoreRefState is true, will not append the reference * marker on the alleles. - * - * @return a string representing the genotypes, or null if the type is unavailable. - */ - @Ensures("result != null || ! isAvailable()") - public String getGenotypeString(boolean ignoreRefState) { - if ( getPloidy() == 0 ) - return "NA"; - - // Notes: - // 1. Make sure to use the appropriate separator depending on whether the genotype is phased - // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) - // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course) - return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR, - ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles()))); - } - - /** - * Utility that returns a list of allele strings corresponding to the alleles in this sample - * @return - */ - protected List getAlleleStrings() { - final List al = new ArrayList(getPloidy()); - for ( Allele a : getAlleles() ) - al.add(a.getBaseString()); - - return al; - } - - public String toString() { - return String.format("[%s %s%s%s%s%s%s%s]", - getSampleName(), - getGenotypeString(false), - toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()), - toStringIfExists(VCFConstants.DEPTH_KEY, getDP()), - toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()), - toStringIfExists(VCFConstants.GENOTYPE_PL_KEY, getPL()), - toStringIfExists(VCFConstants.GENOTYPE_FILTER_KEY, getFilters()), - sortedString(getExtendedAttributes())); - } - - public String toBriefString() { - return String.format("%s:Q%d", getGenotypeString(false), getGQ()); - } - - // --------------------------------------------------------------------------------------------------------- - // - // Comparison operations - // - // --------------------------------------------------------------------------------------------------------- - - /** - * comparable genotypes -> compareTo on the sample names - * @param genotype - * @return - */ - @Override - public int compareTo(final Genotype genotype) { - return getSampleName().compareTo(genotype.getSampleName()); - } - - public boolean sameGenotype(final Genotype other) { - return sameGenotype(other, true); - } - - public boolean sameGenotype(final Genotype other, boolean ignorePhase) { - if (getPloidy() != other.getPloidy()) - return false; // gotta have the same number of allele to be equal - - // By default, compare the elements in the lists of alleles, element-by-element - Collection thisAlleles = this.getAlleles(); - Collection otherAlleles = other.getAlleles(); - - if (ignorePhase) { // do not care about order, only identity of Alleles - thisAlleles = new TreeSet(thisAlleles); //implemented Allele.compareTo() - otherAlleles = new TreeSet(otherAlleles); - } - - return thisAlleles.equals(otherAlleles); - } - - // --------------------------------------------------------------------------------------------------------- - // - // get routines for extended attributes - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Returns the extended attributes for this object - * @return is never null, but is often isEmpty() - */ - @Ensures({"result != null", "! hasForbiddenKey(result)"}) - public abstract Map getExtendedAttributes(); - - /** - * Is key associated with a value (even a null one) in the extended attributes? - * - * Note this will not return true for the inline attributes DP, GQ, AD, or PL - * - * @param key a non-null string key to check for an association - * @return true if key has a value in the extendedAttributes - */ - @Requires({"key != null", "! isForbiddenKey(key)"}) - public boolean hasExtendedAttribute(final String key) { - return getExtendedAttributes().containsKey(key); - } - - /** - * Get the extended attribute value associated with key, if possible - * - * @param key a non-null string key to fetch a value for - * @param defaultValue the value to return if key isn't in the extended attributes - * @return a value (potentially) null associated with key, or defaultValue if no association exists - */ - @Requires({"key != null", "! isForbiddenKey(key)"}) - @Ensures("hasExtendedAttribute(key) || result == defaultValue") - public Object getExtendedAttribute(final String key, final Object defaultValue) { - return hasExtendedAttribute(key) ? getExtendedAttributes().get(key) : defaultValue; - } - - /** - * Same as #getExtendedAttribute with a null default - * - * @param key - * @return - */ - public Object getExtendedAttribute(final String key) { - return getExtendedAttribute(key, null); - } - - /** - * Returns the filter string associated with this Genotype. - * - * @return If this result == null, then the genotype is considered PASSing filters - * If the result != null, then the genotype has failed filtering for the reason(s) - * specified in result. To be reference compliant multiple filter field - * string values can be encoded with a ; separator. - */ - public final String getFilters() { - return filters; - } - - /** - * Is this genotype filtered or not? - * - * @return returns false if getFilters() == null - */ - @Ensures({"result != (getFilters() == null)"}) - public final boolean isFiltered() { - return getFilters() != null; - } - - @Deprecated public boolean hasLog10PError() { return hasGQ(); } - @Deprecated public double getLog10PError() { return getGQ() / -10.0; } - @Deprecated public int getPhredScaledQual() { return getGQ(); } - - @Deprecated - public String getAttributeAsString(String key, String defaultValue) { - Object x = getExtendedAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof String ) return (String)x; - return String.valueOf(x); // throws an exception if this isn't a string - } - - @Deprecated - public int getAttributeAsInt(String key, int defaultValue) { - Object x = getExtendedAttribute(key); - if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; - if ( x instanceof Integer ) return (Integer)x; - return Integer.valueOf((String)x); // throws an exception if this isn't a string - } - - @Deprecated - public double getAttributeAsDouble(String key, double defaultValue) { - Object x = getExtendedAttribute(key); - if ( x == null ) return defaultValue; - if ( x instanceof Double ) return (Double)x; - return Double.valueOf((String)x); // throws an exception if this isn't a string - } - - /** - * A totally generic getter, that allows you to specific keys that correspond - * to even inline values (GQ, for example). Can be very expensive. Additionally, - * all int[] are converted inline into List for convenience. - * - * @param key - * @return - */ - public Object getAnyAttribute(final String key) { - if (key.equals(VCFConstants.GENOTYPE_KEY)) { - return getAlleles(); - } else if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { - return getGQ(); - } else if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { - return Arrays.asList(getAD()); - } else if (key.equals(VCFConstants.GENOTYPE_PL_KEY)) { - return Arrays.asList(getPL()); - } else if (key.equals(VCFConstants.DEPTH_KEY)) { - return getDP(); - } else { - return getExtendedAttribute(key); - } - } - - public boolean hasAnyAttribute(final String key) { - if (key.equals(VCFConstants.GENOTYPE_KEY)) { - return isAvailable(); - } else if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { - return hasGQ(); - } else if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { - return hasAD(); - } else if (key.equals(VCFConstants.GENOTYPE_PL_KEY)) { - return hasPL(); - } else if (key.equals(VCFConstants.DEPTH_KEY)) { - return hasDP(); - } else { - return hasExtendedAttribute(key); - } - } - - // TODO -- add getAttributesAsX interface here - - // ------------------------------------------------------------------------------ - // - // private utilities - // - // ------------------------------------------------------------------------------ - - /** - * a utility method for generating sorted strings from a map key set. - * @param c the map - * @param the key type - * @param the value type - * @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys - */ - @Requires("c != null") - protected static , V> String sortedString(Map c) { - - // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS - final List t = new ArrayList(c.keySet()); - Collections.sort(t); - - final List pairs = new ArrayList(); - for (final T k : t) { - pairs.add(k + "=" + c.get(k)); - } - - return pairs.isEmpty() ? "" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; - } - - /** - * Returns a display name for field name with value v if this isn't -1. Otherwise returns "" - * @param name of the field ("AD") - * @param v the value of the field, or -1 if missing - * @return a non-null string for display if the field is not missing - */ - @Requires("name != null") - @Ensures("result != null") - protected final static String toStringIfExists(final String name, final int v) { - return v == -1 ? "" : " " + name + " " + v; - } - - /** - * Returns a display name for field name with String value v if this isn't null. Otherwise returns "" - * @param name of the field ("FT") - * @param v the value of the field, or null if missing - * @return a non-null string for display if the field is not missing - */ - protected final static String toStringIfExists(final String name, final String v) { - return v == null ? "" : " " + name + " " + v; - } - - /** - * Returns a display name for field name with values vs if this isn't null. Otherwise returns "" - * @param name of the field ("AD") - * @param vs the value of the field, or null if missing - * @return a non-null string for display if the field is not missing - */ - @Requires("name != null") - @Ensures("result != null") - protected final static String toStringIfExists(final String name, final int[] vs) { - if ( vs == null ) - return ""; - else { - StringBuilder b = new StringBuilder(); - b.append(" ").append(name).append(" "); - for ( int i = 0; i < vs.length; i++ ) { - if ( i != 0 ) b.append(","); - b.append(vs[i]); - } - return b.toString(); - } - } - - /** - * Does the attribute map have a mapping involving a forbidden key (i.e., - * one that's managed inline by this Genotypes object? - * - * @param attributes the extended attributes key - * @return - */ - protected final static boolean hasForbiddenKey(final Map attributes) { - for ( final String forbidden : PRIMARY_KEYS) - if ( attributes.containsKey(forbidden) ) - return true; - return false; - } - - protected final static boolean isForbiddenKey(final String key) { - return PRIMARY_KEYS.contains(key); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeBuilder.java deleted file mode 100644 index 31ba94231..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeBuilder.java +++ /dev/null @@ -1,419 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.*; - -/** - * A builder class for genotypes - * - * Provides convenience setter methods for all of the Genotype field - * values. Setter methods can be used in any order, allowing you to - * pass through states that wouldn't be allowed in the highly regulated - * immutable Genotype class. - * - * All fields default to meaningful MISSING values. - * - * Call make() to actually create the corresponding Genotype object from - * this builder. Can be called multiple times to create independent copies, - * or with intervening sets to conveniently make similar Genotypes with - * slight modifications. - * - * @author Mark DePristo - * @since 06/12 - */ -@Invariant({"alleles != null"}) -public final class GenotypeBuilder { - private static final List HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL); - private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - private String sampleName = null; - private List alleles = Collections.emptyList(); - - private boolean isPhased = false; - private int GQ = -1; - private int DP = -1; - private int[] AD = null; - private int[] PL = null; - private Map extendedAttributes = null; - private String filters = null; - private int initialAttributeMapSize = 5; - - private final static Map NO_ATTRIBUTES = - Collections.unmodifiableMap(new HashMap(0)); - - // ----------------------------------------------------------------- - // - // Factory methods - // - // ----------------------------------------------------------------- - - public static Genotype create(final String sampleName, final List alleles) { - return new GenotypeBuilder(sampleName, alleles).make(); - } - - public static Genotype create(final String sampleName, - final List alleles, - final Map attributes) { - return new GenotypeBuilder(sampleName, alleles).attributes(attributes).make(); - } - - protected static Genotype create(final String sampleName, - final List alleles, - final double[] gls) { - return new GenotypeBuilder(sampleName, alleles).PL(gls).make(); - } - - /** - * Create a new Genotype object for a sample that's missing from the VC (i.e., in - * the output header). Defaults to a diploid no call genotype ./. - * - * @param sampleName the name of this sample - * @return an initialized Genotype with sampleName that's a diploid ./. no call genotype - */ - public static Genotype createMissing(final String sampleName, final int ploidy) { - final GenotypeBuilder builder = new GenotypeBuilder(sampleName); - switch ( ploidy ) { - case 1: builder.alleles(HAPLOID_NO_CALL); break; - case 2: builder.alleles(DIPLOID_NO_CALL); break; - default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break; - } - return builder.make(); - } - - /** - * Create a empty builder. Both a sampleName and alleles must be provided - * before trying to make a Genotype from this builder. - */ - public GenotypeBuilder() {} - - /** - * Create a builder using sampleName. Alleles must be provided - * before trying to make a Genotype from this builder. - * @param sampleName - */ - public GenotypeBuilder(final String sampleName) { - name(sampleName); - } - - /** - * Make a builder using sampleName and alleles for starting values - * @param sampleName - * @param alleles - */ - public GenotypeBuilder(final String sampleName, final List alleles) { - name(sampleName); - alleles(alleles); - } - - /** - * Create a new builder starting with the values in Genotype g - * @param g - */ - public GenotypeBuilder(final Genotype g) { - copy(g); - } - - /** - * Copy all of the values for this builder from Genotype g - * @param g - * @return - */ - public GenotypeBuilder copy(final Genotype g) { - name(g.getSampleName()); - alleles(g.getAlleles()); - phased(g.isPhased()); - GQ(g.getGQ()); - DP(g.getDP()); - AD(g.getAD()); - PL(g.getPL()); - filter(g.getFilters()); - attributes(g.getExtendedAttributes()); - return this; - } - - /** - * Reset all of the builder attributes to their defaults. After this - * function you must provide sampleName and alleles before trying to - * make more Genotypes. - */ - public final void reset(final boolean keepSampleName) { - if ( ! keepSampleName ) sampleName = null; - alleles = Collections.emptyList(); - isPhased = false; - GQ = -1; - DP = -1; - AD = null; - PL = null; - filters = null; - extendedAttributes = null; - } - - /** - * Create a new Genotype object using the values set in this builder. - * - * After creation the values in this builder can be modified and more Genotypes - * created, althrough the contents of array values like PL should never be modified - * inline as they are not copied for efficiency reasons. - * - * @return a newly minted Genotype object with values provided from this builder - */ - @Ensures({"result != null"}) - public Genotype make() { - final Map ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes; - return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea); - } - - /** - * Set this genotype's name - * @param sampleName - * @return - */ - @Requires({"sampleName != null"}) - @Ensures({"this.sampleName != null"}) - public GenotypeBuilder name(final String sampleName) { - this.sampleName = sampleName; - return this; - } - - /** - * Set this genotype's alleles - * @param alleles - * @return - */ - @Ensures({"this.alleles != null"}) - public GenotypeBuilder alleles(final List alleles) { - if ( alleles == null ) - this.alleles = Collections.emptyList(); - else - this.alleles = alleles; - return this; - } - - /** - * Is this genotype phased? - * @param phased - * @return - */ - public GenotypeBuilder phased(final boolean phased) { - isPhased = phased; - return this; - } - - @Requires({"GQ >= -1"}) - @Ensures({"this.GQ == GQ", "this.GQ >= -1"}) - public GenotypeBuilder GQ(final int GQ) { - this.GQ = GQ; - return this; - } - - /** - * Adaptor interface from the pLog10Error system. - * - * Will be retired when - * - * @param pLog10Error - * @return - */ - @Deprecated - public GenotypeBuilder log10PError(final double pLog10Error) { - if ( pLog10Error == CommonInfo.NO_LOG10_PERROR ) - return GQ(-1); - else - return GQ((int)Math.round(pLog10Error * -10)); - } - - /** - * This genotype has no GQ value - * @return - */ - public GenotypeBuilder noGQ() { GQ = -1; return this; } - - /** - * This genotype has no AD value - * @return - */ - public GenotypeBuilder noAD() { AD = null; return this; } - - /** - * This genotype has no DP value - * @return - */ - public GenotypeBuilder noDP() { DP = -1; return this; } - - /** - * This genotype has no PL value - * @return - */ - public GenotypeBuilder noPL() { PL = null; return this; } - - /** - * This genotype has this DP value - * @return - */ - @Requires({"DP >= -1"}) - @Ensures({"this.DP == DP"}) - public GenotypeBuilder DP(final int DP) { - this.DP = DP; - return this; - } - - /** - * This genotype has this AD value - * @return - */ - @Requires({"AD == null || AD.length > 0"}) - @Ensures({"this.AD == AD"}) - public GenotypeBuilder AD(final int[] AD) { - this.AD = AD; - return this; - } - - /** - * This genotype has this PL value, as int[]. FAST - * @return - */ - @Requires("PL == null || PL.length > 0") - @Ensures({"this.PL == PL"}) - public GenotypeBuilder PL(final int[] PL) { - this.PL = PL; - return this; - } - - /** - * This genotype has this PL value, converted from double[]. SLOW - * @return - */ - @Requires("PL == null || PL.length > 0") - @Ensures({"this.PL == PL"}) - public GenotypeBuilder PL(final double[] GLs) { - this.PL = GenotypeLikelihoods.fromLog10Likelihoods(GLs).getAsPLs(); - return this; - } - - /** - * This genotype has these attributes. - * - * Cannot contain inline attributes (DP, AD, GQ, PL) - * @return - */ - @Requires("attributes != null") - @Ensures("attributes.isEmpty() || extendedAttributes != null") - public GenotypeBuilder attributes(final Map attributes) { - for ( Map.Entry pair : attributes.entrySet() ) - attribute(pair.getKey(), pair.getValue()); - return this; - } - - /** - * Tells this builder to remove all extended attributes - * - * @return - */ - public GenotypeBuilder noAttributes() { - this.extendedAttributes = null; - return this; - } - - /** - * This genotype has this attribute key / value pair. - * - * Cannot contain inline attributes (DP, AD, GQ, PL) - * @return - */ - @Requires({"key != null"}) - @Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"}) - public GenotypeBuilder attribute(final String key, final Object value) { - if ( extendedAttributes == null ) - extendedAttributes = new HashMap(initialAttributeMapSize); - extendedAttributes.put(key, value); - return this; - } - - /** - * Tells this builder to make a Genotype object that has had filters applied, - * which may be empty (passes) or have some value indicating the reasons - * why it's been filtered. - * - * @param filters non-null list of filters. empty list => PASS - * @return this builder - */ - @Requires("filters != null") - public GenotypeBuilder filters(final List filters) { - if ( filters.isEmpty() ) - return filter(null); - else if ( filters.size() == 1 ) - return filter(filters.get(0)); - else - return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters))); - } - - /** - * varargs version of #filters - * @param filters - * @return - */ - @Requires("filters != null") - public GenotypeBuilder filters(final String ... filters) { - return filters(Arrays.asList(filters)); - } - - /** - * Most efficient version of setting filters -- just set the filters string to filters - * - * @param filter if filters == null or filters.equals("PASS") => genotype is PASS - * @return - */ - public GenotypeBuilder filter(final String filter) { - this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter; - return this; - } - - /** - * This genotype is unfiltered - * - * @return - */ - public GenotypeBuilder unfiltered() { - return filter(null); - } - - /** - * Tell's this builder that we have at most these number of attributes - * @return - */ - public GenotypeBuilder maxAttributes(final int i) { - initialAttributeMapSize = i; - return this; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeLikelihoods.java deleted file mode 100644 index 1f6da6ecc..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeLikelihoods.java +++ /dev/null @@ -1,463 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.Arrays; -import java.util.EnumMap; -import java.util.List; - -public class GenotypeLikelihoods { - private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; - private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10; - // caching numAlleles up to 5 and ploidy up to 10 - private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY]; - - public final static int MAX_PL = Short.MAX_VALUE; - - // - // There are two objects here because we are lazy in creating both representations - // for this object: a vector of log10 Probs and the PL phred-scaled string. Supports - // having one set during initializating, and dynamic creation of the other, if needed - // - private double[] log10Likelihoods = null; - private String likelihoodsAsString_PLs = null; - - - /** - * initialize num likelihoods cache - */ - static { - // must be done before PLIndexToAlleleIndex - for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) { - for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) { - numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy); - } - } - } - - /** - * The maximum number of alleles that we can represent as genotype likelihoods - */ - public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; - - /* - * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles - */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); - - public final static GenotypeLikelihoods fromPLField(String PLs) { - return new GenotypeLikelihoods(PLs); - } - - @Deprecated - public final static GenotypeLikelihoods fromGLField(String GLs) { - return new GenotypeLikelihoods(parseDeprecatedGLString(GLs)); - } - - public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) { - return new GenotypeLikelihoods(log10Likelihoods); - } - - public final static GenotypeLikelihoods fromPLs(final int[] pls) { - return new GenotypeLikelihoods(PLsToGLs(pls)); - } - - // - // You must use the factory methods now - // - private GenotypeLikelihoods(String asString) { - likelihoodsAsString_PLs = asString; - } - - private GenotypeLikelihoods(double[] asVector) { - log10Likelihoods = asVector; - } - - /** - * Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this - * vector returns math.log10(x) for each of the genotypes. Can return null if the - * genotype likelihoods are "missing". - * - * @return - */ - public double[] getAsVector() { - // assumes one of the likelihoods vector or the string isn't null - if ( log10Likelihoods == null ) { - // make sure we create the GL string if it doesn't already exist - log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs); - } - - return log10Likelihoods; - } - - public int[] getAsPLs() { - final double[] GLs = getAsVector(); - return GLs == null ? null : GLsToPLs(GLs); - } - - public String toString() { - return getAsString(); - } - - public String getAsString() { - if ( likelihoodsAsString_PLs == null ) { - // todo -- should we accept null log10Likelihoods and set PLs as MISSING? - if ( log10Likelihoods == null ) - throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!"); - likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods); - } - - return likelihoodsAsString_PLs; - } - - @Override public boolean equals(Object aThat) { - //check for self-comparison - if ( this == aThat ) return true; - - if ( !(aThat instanceof GenotypeLikelihoods) ) return false; - GenotypeLikelihoods that = (GenotypeLikelihoods)aThat; - - // now a proper field-by-field evaluation can be made. - // GLs are considered equal if the corresponding PLs are equal - return Arrays.equals(getAsPLs(), that.getAsPLs()); - } - - //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values - //Returns null in case of missing likelihoods - public EnumMap getAsMap(boolean normalizeFromLog10){ - //Make sure that the log10likelihoods are set - double[] likelihoods = normalizeFromLog10 ? GeneralUtils.normalizeFromLog10(getAsVector()) : getAsVector(); - if(likelihoods == null) - return null; - EnumMap likelihoodsMap = new EnumMap(GenotypeType.class); - likelihoodsMap.put(GenotypeType.HOM_REF,likelihoods[GenotypeType.HOM_REF.ordinal()-1]); - likelihoodsMap.put(GenotypeType.HET,likelihoods[GenotypeType.HET.ordinal()-1]); - likelihoodsMap.put(GenotypeType.HOM_VAR, likelihoods[GenotypeType.HOM_VAR.ordinal() - 1]); - return likelihoodsMap; - } - - //Return the neg log10 Genotype Quality (GQ) for the given genotype - //Returns Double.NEGATIVE_INFINITY in case of missing genotype - - /** - * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context. - * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List) in place of it. - * - * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly. - * @param genotype - actually a genotype type (no call, hom ref, het, hom var) - * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). - */ - @Deprecated - public double getLog10GQ(GenotypeType genotype){ - return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); - } - - @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"}) - private double getLog10GQ(List genotypeAlleles,List contextAlleles) { - int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0)); - int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1)); - int plIndex = calculatePLindex(allele1Index,allele2Index); - return getGQLog10FromLikelihoods(plIndex,getAsVector()); - } - - public double getLog10GQ(Genotype genotype, List vcAlleles ) { - return getLog10GQ(genotype.getAlleles(),vcAlleles); - } - - public double getLog10GQ(Genotype genotype, VariantContext context) { - return getLog10GQ(genotype,context.getAlleles()); - } - - public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ - if(likelihoods == null) - return Double.NEGATIVE_INFINITY; - - double qual = Double.NEGATIVE_INFINITY; - for (int i=0; i < likelihoods.length; i++) { - if (i==iOfChoosenGenotype) - continue; - if (likelihoods[i] >= qual) - qual = likelihoods[i]; - } - - // qual contains now max(likelihoods[k]) for all k != bestGTguess - qual = likelihoods[iOfChoosenGenotype] - qual; - - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. - // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one - double[] normalized = GeneralUtils.normalizeFromLog10(likelihoods); - double chosenGenotype = normalized[iOfChoosenGenotype]; - return Math.log10(1.0 - chosenGenotype); - } else { - // invert the size, as this is the probability of making an error - return -1 * qual; - } - } - - private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { - if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { - String[] strings = likelihoodsAsString_PLs.split(","); - double[] likelihoodsAsVector = new double[strings.length]; - try { - for ( int i = 0; i < strings.length; i++ ) { - likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0; - } - } catch (NumberFormatException e) { - throw new TribbleException("The GL/PL tag contains non-integer values: " + likelihoodsAsString_PLs); - } - return likelihoodsAsVector; - } else - return null; - } - - /** - * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format - * @param GLString - * @return - */ - private final static double[] parseDeprecatedGLString(String GLString) { - if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) { - String[] strings = GLString.split(","); - double[] likelihoodsAsVector = new double[strings.length]; - for ( int i = 0; i < strings.length; i++ ) { - likelihoodsAsVector[i] = Double.parseDouble(strings[i]); - } - return likelihoodsAsVector; - } - - return null; - } - - private final static String convertLikelihoodsToPLString(final double[] GLs) { - if ( GLs == null ) - return VCFConstants.MISSING_VALUE_v4; - - final StringBuilder s = new StringBuilder(); - boolean first = true; - for ( final int pl : GLsToPLs(GLs) ) { - if ( ! first ) - s.append(","); - else - first = false; - - s.append(pl); - } - - return s.toString(); - } - - private final static int[] GLsToPLs(final double[] GLs) { - final int[] pls = new int[GLs.length]; - final double adjust = maxPL(GLs); - - for ( int i = 0; i < GLs.length; i++ ) { - pls[i] = (int)Math.round(Math.min(-10 * (GLs[i] - adjust), MAX_PL)); - } - - return pls; - } - - private final static double maxPL(final double[] GLs) { - double adjust = Double.NEGATIVE_INFINITY; - for ( double l : GLs ) adjust = Math.max(adjust, l); - return adjust; - } - - private final static double[] PLsToGLs(final int pls[]) { - double[] likelihoodsAsVector = new double[pls.length]; - for ( int i = 0; i < pls.length; i++ ) { - likelihoodsAsVector[i] = pls[i] / -10.0; - } - return likelihoodsAsVector; - } - - // ------------------------------------------------------------------------------------- - // - // Static conversion utilities, going from GL/PL index to allele index and vice versa. - // - // ------------------------------------------------------------------------------------- - - /* - * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index. - * Note that the reference allele is always index=0. - */ - public static class GenotypeLikelihoodsAllelePair { - public final int alleleIndex1, alleleIndex2; - - public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) { - this.alleleIndex1 = alleleIndex1; - this.alleleIndex2 = alleleIndex2; - } - } - - private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = numLikelihoods(1 + altAlleles, 2); - final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; - - // for all possible combinations of 2 alleles - for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { - for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { - cache[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2); - } - } - - // a bit of sanity checking - for ( int i = 0; i < cache.length; i++ ) { - if ( cache[i] == null ) - throw new IllegalStateException("BUG: cache entry " + i + " is unexpected null"); - } - - return cache; - } - - // ------------------------------------------------------------------------------------- - // - // num likelihoods given number of alleles and ploidy - // - // ------------------------------------------------------------------------------------- - - /** - * Actually does the computation in @see #numLikelihoods - * - * @param numAlleles - * @param ploidy - * @return - */ - private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) { - if (numAlleles == 1) - return 1; - else if (ploidy == 1) - return numAlleles; - else { - int acc =0; - for (int k=0; k <= ploidy; k++ ) - acc += calcNumLikelihoods(numAlleles - 1, ploidy - k); - return acc; - } - } - - /** - * Compute how many likelihood elements are associated with the given number of alleles - * Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P) - * where P = ploidy (number of chromosomes) and N = total # of alleles. - * Each chromosome can be in one single state (0,...,N-1) and there are P of them. - * Naive solution would be to store N*P likelihoods, but this is not necessary because we can't distinguish chromosome states, but rather - * only total number of alt allele counts in all chromosomes. - * - * For example, S(3,2) = 6: For alleles A,B,C, on a diploid organism we have six possible genotypes: - * AA,AB,BB,AC,BC,CC. - * Another way of expressing is with vector (#of A alleles, # of B alleles, # of C alleles) - * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2) - * In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2 - * - * Note this method caches the value for most common num Allele / ploidy combinations for efficiency - * - * Recursive implementation: - * S(N,P) = sum_{k=0}^P S(N-1,P-k) - * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K - * With initial conditions - * S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors) - * S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself). - * - * @param numAlleles Number of alleles (including ref) - * @param ploidy Ploidy, or number of chromosomes in set - * @return Number of likelihood elements we need to hold. - */ - @Requires({"ploidy > 0", "numAlleles > 0"}) - @Ensures("result > 0") - public static int numLikelihoods(final int numAlleles, final int ploidy) { - if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES - && ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY ) - return numLikelihoodCache[numAlleles][ploidy]; - else { - // have to calculate on the fly - return calcNumLikelihoods(numAlleles, ploidy); - } - } - - // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. - // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc." - // Assumes that allele1Index < allele2Index - public static int calculatePLindex(final int allele1Index, final int allele2Index) { - return (allele2Index * (allele2Index+1) / 2) + allele1Index; - } - - /** - * get the allele index pair for the given PL - * - * @param PLindex the PL index - * @return the allele index pair - */ - public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) { - // make sure that we've cached enough data - if ( PLindex >= PLIndexToAlleleIndex.length ) - throw new IllegalStateException("Internal limitation: cannot genotype more than " + MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles"); - - return PLIndexToAlleleIndex[PLindex]; - } - - // An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles - protected static final int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9}; - - /** - * get the allele index pair for the given PL using the deprecated PL ordering: - * AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD. - * Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated - * ordering and I know with certainty that external users have built code on top of it; changing it now would - * cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method. - * This method assumes at most 3 alternate alleles. - * - * @param PLindex the PL index - * @return the allele index pair - */ - @Deprecated - public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) { - return getAllelePair(PLindexConversion[PLindex]); - } - - /** - * get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index. - * - * @param allele1Index the index in VariantContext.getAllele() of the first allele - * @param allele2Index the index in VariantContext.getAllele() of the second allele - * @return the PL indexes - */ - public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) { - - final int[] indexes = new int[3]; - indexes[0] = calculatePLindex(allele1Index, allele1Index); - indexes[1] = calculatePLindex(allele1Index, allele2Index); - indexes[2] = calculatePLindex(allele2Index, allele2Index); - return indexes; - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeType.java b/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeType.java deleted file mode 100644 index 707443121..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypeType.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -/** - * Summary types for Genotype objects - * - * @author Your Name - * @since Date created - */ -public enum GenotypeType { - /** The sample is no-called (all alleles are NO_CALL */ - NO_CALL, - /** The sample is homozygous reference */ - HOM_REF, - /** The sample is heterozygous, with at least one ref and at least one one alt in any order */ - HET, - /** All alleles are non-reference */ - HOM_VAR, - /** There is no allele data availble for this sample (alleles.isEmpty) */ - UNAVAILABLE, - /** Some chromosomes are NO_CALL and others are called */ - MIXED // no-call and call in the same genotype -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/variant/variantcontext/GenotypesContext.java deleted file mode 100644 index d0684d27e..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/GenotypesContext.java +++ /dev/null @@ -1,724 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Represents an ordered collection of Genotype objects - */ -public class GenotypesContext implements List { - /** - * static constant value for an empty GenotypesContext. Useful since so many VariantContexts have no genotypes - */ - public final static GenotypesContext NO_GENOTYPES = - new GenotypesContext(new ArrayList(0), new HashMap(0), Collections.emptyList()).immutable(); - - /** - *sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical order - */ - List sampleNamesInOrder = null; - - /** - * a map optimized for efficient lookup. Each genotype in genotypes must have its - * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that - * genotype in the vector of genotypes - */ - Map sampleNameToOffset = null; - - /** - * An ArrayList of genotypes contained in this context - * - * WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY - * ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD. - * - */ - ArrayList notToBeDirectlyAccessedGenotypes; - - /** - * Cached value of the maximum ploidy observed among all samples - */ - private int maxPloidy = -1; - - /** Are we allowing users to modify the list? */ - boolean immutable = false; - - // --------------------------------------------------------------------------- - // - // private constructors -- you have to use static create methods to make these classes - // - // --------------------------------------------------------------------------- - - /** - * Create an empty GenotypeContext - */ - protected GenotypesContext() { - this(10); - } - - /** - * Create an empty GenotypeContext, with initial capacity for n elements - */ - @Requires("n >= 0") - protected GenotypesContext(final int n) { - this(new ArrayList(n)); - } - - /** - * Create an GenotypeContext containing genotypes - */ - @Requires("genotypes != null") - protected GenotypesContext(final ArrayList genotypes) { - this.notToBeDirectlyAccessedGenotypes = genotypes; - this.sampleNameToOffset = null; - } - - /** - * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, - * and sorted sample names - * - * @param genotypes our genotypes in arbitrary - * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its - * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that - * genotype in the vector of genotypes - * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical - * order. - */ - @Requires({"genotypes != null", - "sampleNameToOffset != null", - "sampleNamesInOrder != null", - "genotypes.size() == sampleNameToOffset.size()", - "genotypes.size() == sampleNamesInOrder.size()"}) - protected GenotypesContext(final ArrayList genotypes, - final Map sampleNameToOffset, - final List sampleNamesInOrder) { - this.notToBeDirectlyAccessedGenotypes = genotypes; - this.sampleNameToOffset = sampleNameToOffset; - this.sampleNamesInOrder = sampleNamesInOrder; - } - - // --------------------------------------------------------------------------- - // - // public static factory methods - // - // --------------------------------------------------------------------------- - - /** - * Basic creation routine - * @return an empty, mutable GenotypeContext - */ - @Ensures({"result != null"}) - public static final GenotypesContext create() { - return new GenotypesContext(); - } - - /** - * Basic creation routine - * @return an empty, mutable GenotypeContext with initial capacity for nGenotypes - */ - @Requires("nGenotypes >= 0") - @Ensures({"result != null"}) - public static final GenotypesContext create(final int nGenotypes) { - return new GenotypesContext(nGenotypes); - } - - /** - * Create a fully resolved GenotypeContext containing genotypes, sample lookup table, - * and sorted sample names - * - * @param genotypes our genotypes in arbitrary - * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its - * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that - * genotype in the vector of genotypes - * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical - * order. - * @return an mutable GenotypeContext containing genotypes with already present lookup data - */ - @Requires({"genotypes != null", - "sampleNameToOffset != null", - "sampleNamesInOrder != null"}) - @Ensures({"result != null"}) - public static final GenotypesContext create(final ArrayList genotypes, - final Map sampleNameToOffset, - final List sampleNamesInOrder) { - return new GenotypesContext(genotypes, sampleNameToOffset, sampleNamesInOrder); - } - - /** - * Create a fully resolved GenotypeContext containing genotypes - * - * @param genotypes our genotypes in arbitrary - * @return an mutable GenotypeContext containing genotypes - */ - @Requires({"genotypes != null"}) - @Ensures({"result != null"}) - public static final GenotypesContext create(final ArrayList genotypes) { - return genotypes == null ? NO_GENOTYPES : new GenotypesContext(genotypes); - } - - /** - * Create a fully resolved GenotypeContext containing genotypes - * - * @param genotypes our genotypes in arbitrary - * @return an mutable GenotypeContext containing genotypes - */ - @Requires({"genotypes != null"}) - @Ensures({"result != null"}) - public static final GenotypesContext create(final Genotype... genotypes) { - return create(new ArrayList(Arrays.asList(genotypes))); - } - - /** - * Create a freshly allocated GenotypeContext containing the genotypes in toCopy - * - * @param toCopy the GenotypesContext to copy - * @return an mutable GenotypeContext containing genotypes - */ - @Requires({"toCopy != null"}) - @Ensures({"result != null"}) - public static final GenotypesContext copy(final GenotypesContext toCopy) { - return create(new ArrayList(toCopy.getGenotypes())); - } - - /** - * Create a GenotypesContext containing the genotypes in iteration order contained - * in toCopy - * - * @param toCopy the collection of genotypes - * @return an mutable GenotypeContext containing genotypes - */ - @Ensures({"result != null"}) - public static final GenotypesContext copy(final Collection toCopy) { - return toCopy == null ? NO_GENOTYPES : create(new ArrayList(toCopy)); - } - - // --------------------------------------------------------------------------- - // - // Mutability methods - // - // --------------------------------------------------------------------------- - - public final GenotypesContext immutable() { - immutable = true; - return this; - } - - public boolean isMutable() { - return ! immutable; - } - - public final void checkImmutability() { - if ( immutable ) - throw new IllegalAccessError("GenotypeMap is currently immutable, but a mutator method was invoked on it"); - } - - // --------------------------------------------------------------------------- - // - // caches - // - // --------------------------------------------------------------------------- - - @Ensures({"sampleNameToOffset == null"}) - protected void invalidateSampleNameMap() { - sampleNameToOffset = null; - } - - @Ensures({"sampleNamesInOrder == null"}) - protected void invalidateSampleOrdering() { - sampleNamesInOrder = null; - } - - @Ensures({"sampleNamesInOrder != null"}) - protected void ensureSampleOrdering() { - if ( sampleNamesInOrder == null ) { - sampleNamesInOrder = new ArrayList(size()); - - for ( int i = 0; i < size(); i++ ) { - sampleNamesInOrder.add(getGenotypes().get(i).getSampleName()); - } - Collections.sort(sampleNamesInOrder); - } - } - - @Ensures({"sampleNameToOffset != null"}) - protected void ensureSampleNameMap() { - if ( sampleNameToOffset == null ) { - sampleNameToOffset = new HashMap(size()); - - for ( int i = 0; i < size(); i++ ) { - sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i); - } - } - } - - // --------------------------------------------------------------------------- - // - // Lazy methods - // - // --------------------------------------------------------------------------- - - public boolean isLazyWithData() { - return this instanceof LazyGenotypesContext && - ((LazyGenotypesContext)this).getUnparsedGenotypeData() != null; - } - - // --------------------------------------------------------------------------- - // - // Map methods - // - // --------------------------------------------------------------------------- - - protected ArrayList getGenotypes() { - return notToBeDirectlyAccessedGenotypes; - } - - @Override - public void clear() { - checkImmutability(); - invalidateSampleNameMap(); - invalidateSampleOrdering(); - getGenotypes().clear(); - } - - @Override - public int size() { - return getGenotypes().size(); - } - - @Override - public boolean isEmpty() { - return getGenotypes().isEmpty(); - } - - /** - * Adds a single genotype to this context. - * - * There are many constraints on this input, and important - * impacts on the performance of other functions provided by this - * context. - * - * First, the sample name of genotype must be unique within this - * context. However, this is not enforced in the code itself, through - * you will invalid the contract on this context if you add duplicate - * samples and are running with CoFoJa enabled. - * - * Second, adding genotype also updates the sample name -> index map, - * so add() followed by containsSample and related function is an efficient - * series of operations. - * - * Third, adding the genotype invalidates the sorted list of sample names, to - * add() followed by any of the SampleNamesInOrder operations is inefficient, as - * each SampleNamesInOrder must rebuild the sorted list of sample names at - * an O(n log n) cost. - * - * @param genotype - * @return - */ - @Override - @Requires({"genotype != null", "get(genotype.getSampleName()) == null"}) - public boolean add(final Genotype genotype) { - checkImmutability(); - invalidateSampleOrdering(); - - if ( sampleNameToOffset != null ) { - // update the name map by adding entries - sampleNameToOffset.put(genotype.getSampleName(), size()); - } - - return getGenotypes().add(genotype); - } - - @Override - @Requires("! contains(genotype)") - public void add(final int i, final Genotype genotype) { - throw new UnsupportedOperationException(); - } - - /** - * Adds all of the genotypes to this context - * - * See {@link #add(Genotype)} for important information about this functions - * constraints and performance costs - * - * @param genotypes - * @return - */ - @Override - @Requires("! containsAny(genotypes)") - public boolean addAll(final Collection genotypes) { - checkImmutability(); - invalidateSampleOrdering(); - - if ( sampleNameToOffset != null ) { - // update the name map by adding entries - int pos = size(); - for ( final Genotype g : genotypes ) { - sampleNameToOffset.put(g.getSampleName(), pos++); - } - } - - return getGenotypes().addAll(genotypes); - } - - @Override - public boolean addAll(final int i, final Collection genotypes) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean contains(final Object o) { - return getGenotypes().contains(o); - } - - @Override - public boolean containsAll(final Collection objects) { - return getGenotypes().containsAll(objects); - } - - private boolean containsAny(final Collection genotypes) { - for ( final Genotype g : genotypes ) { - if ( contains(g) ) return true; - } - return false; - } - - @Override - public Genotype get(final int i) { - return getGenotypes().get(i); - } - - /** - * What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present - * - * @param defaultPloidy the default ploidy, if all samples are no-called - * @return - */ - @Ensures("result >= 0") - public int getMaxPloidy(final int defaultPloidy) { - if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0"); - - if ( maxPloidy == -1 ) { - maxPloidy = 0; // necessary in the case where there are no genotypes - for ( final Genotype g : getGenotypes() ) { - maxPloidy = Math.max(g.getPloidy(), maxPloidy); - } - - // everything is no called so we return the default ploidy - if ( maxPloidy == 0 ) maxPloidy = defaultPloidy; - } - - return maxPloidy; - } - - /** - * Gets sample associated with this sampleName, or null if none is found - * - * @param sampleName - * @return - */ - public Genotype get(final String sampleName) { - Integer offset = getSampleI(sampleName); - return offset == null ? null : getGenotypes().get(offset); - } - - private Integer getSampleI(final String sampleName) { - ensureSampleNameMap(); - return sampleNameToOffset.get(sampleName); - } - - @Override - public int indexOf(final Object o) { - return getGenotypes().indexOf(o); - } - - @Override - public Iterator iterator() { - return getGenotypes().iterator(); - } - - @Override - public int lastIndexOf(final Object o) { - return getGenotypes().lastIndexOf(o); - } - - @Override - public ListIterator listIterator() { - // todo -- must be immutable - throw new UnsupportedOperationException(); -// return genotypes.listIterator(); - } - - @Override - public ListIterator listIterator(final int i) { - // todo -- must be immutable - throw new UnsupportedOperationException(); -// return genotypes.listIterator(i); - } - - /** - * Note that remove requires us to invalidate our sample -> index - * cache. The loop: - * - * GenotypesContext gc = ... - * for ( sample in samples ) - * if ( gc.containsSample(sample) ) - * gc.remove(sample) - * - * is extremely inefficient, as each call to remove invalidates the cache - * and containsSample requires us to rebuild it, an O(n) operation. - * - * If you must remove many samples from the GC, use either removeAll or retainAll - * to avoid this O(n * m) operation. - * - * @param i - * @return - */ - @Override - public Genotype remove(final int i) { - checkImmutability(); - invalidateSampleNameMap(); - invalidateSampleOrdering(); - return getGenotypes().remove(i); - } - - /** - * See for important warning {@link this.remove(Integer)} - * @param o - * @return - */ - @Override - public boolean remove(final Object o) { - checkImmutability(); - invalidateSampleNameMap(); - invalidateSampleOrdering(); - return getGenotypes().remove(o); - } - - @Override - public boolean removeAll(final Collection objects) { - checkImmutability(); - invalidateSampleNameMap(); - invalidateSampleOrdering(); - return getGenotypes().removeAll(objects); - } - - @Override - public boolean retainAll(final Collection objects) { - checkImmutability(); - invalidateSampleNameMap(); - invalidateSampleOrdering(); - return getGenotypes().retainAll(objects); - } - - @Override - public Genotype set(final int i, final Genotype genotype) { - checkImmutability(); - final Genotype prev = getGenotypes().set(i, genotype); - - invalidateSampleOrdering(); - if ( sampleNameToOffset != null ) { - // update the name map by removing the old entry and replacing it with the new one - sampleNameToOffset.remove(prev.getSampleName()); - sampleNameToOffset.put(genotype.getSampleName(), i); - } - - return prev; - } - - /** - * Replaces the genotype in this context -- note for efficiency - * reasons we do not add the genotype if it's not present. The - * return value will be null indicating this happened. - * - * Note this operation is preserves the map cache Sample -> Offset but - * invalidates the sorted list of samples. Using replace within a loop - * containing any of the SampleNameInOrder operation requires an O(n log n) - * resorting after each replace operation. - * - * @param genotype a non null genotype to bind in this context - * @return null if genotype was not added, otherwise returns the previous genotype - */ - @Requires("genotype != null") - public Genotype replace(final Genotype genotype) { - checkImmutability(); - Integer offset = getSampleI(genotype.getSampleName()); - if ( offset == null ) - return null; - else - return set(offset, genotype); - } - - @Override - public List subList(final int i, final int i1) { - return getGenotypes().subList(i, i1); - } - - @Override - public Object[] toArray() { - return getGenotypes().toArray(); - } - - @Override - public T[] toArray(final T[] ts) { - return getGenotypes().toArray(ts); - } - - /** - * Iterate over the Genotypes in this context in the order specified by sampleNamesInOrder - * - * @param sampleNamesInOrder a Iterable of String, containing exactly one entry for each Genotype sample name in - * this context - * @return a Iterable over the genotypes in this context. - */ - @Requires("sampleNamesInOrder != null") - public Iterable iterateInSampleNameOrder(final Iterable sampleNamesInOrder) { - return new Iterable() { - @Override - public Iterator iterator() { - return new InOrderIterator(sampleNamesInOrder.iterator()); - } - }; - } - - /** - * Iterate over the Genotypes in this context in their sample name order (A, B, C) - * regardless of the underlying order in the vector of genotypes - * @return a Iterable over the genotypes in this context. - */ - public Iterable iterateInSampleNameOrder() { - return iterateInSampleNameOrder(getSampleNamesOrderedByName()); - } - - private final class InOrderIterator implements Iterator { - final Iterator sampleNamesInOrder; - - private InOrderIterator(final Iterator sampleNamesInOrder) { - this.sampleNamesInOrder = sampleNamesInOrder; - } - - @Override - public boolean hasNext() { - return sampleNamesInOrder.hasNext(); - } - - @Override - public Genotype next() { - return get(sampleNamesInOrder.next()); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - } - - /** - * @return The set of sample names for all genotypes in this context, in arbitrary order - */ - @Ensures("result != null") - public Set getSampleNames() { - ensureSampleNameMap(); - return sampleNameToOffset.keySet(); - } - - /** - * @return The set of sample names for all genotypes in this context, in their natural ordering (A, B, C) - */ - @Ensures("result != null") - public List getSampleNamesOrderedByName() { - ensureSampleOrdering(); - return sampleNamesInOrder; - } - - @Requires("sample != null") - public boolean containsSample(final String sample) { - ensureSampleNameMap(); - return sampleNameToOffset.containsKey(sample); - } - - @Requires("samples != null") - public boolean containsSamples(final Collection samples) { - return getSampleNames().containsAll(samples); - } - - /** - * Return a freshly allocated subcontext of this context containing only the samples - * listed in samples. Note that samples can contain names not in this context, they - * will just be ignored. - * - * @param samples - * @return - */ - @Requires("samples != null") - @Ensures("result != null") - public GenotypesContext subsetToSamples( final Set samples ) { - final int nSamples = samples.size(); - - if ( nSamples == 0 ) - return NO_GENOTYPES; - else { // nGenotypes < nSamples - final GenotypesContext subset = create(samples.size()); - for ( final String sample : samples ) { - final Genotype g = get(sample); - if ( g != null ) - subset.add(g); - } - return subset; - } - } - - @Override - public String toString() { - final List gS = new ArrayList(); - for ( final Genotype g : this.iterateInSampleNameOrder() ) - gS.add(g.toString()); - return "[" + join(",", gS) + "]"; - } - - // copied from Utils - private static String join(final String separator, final Collection objects) { - if (objects.isEmpty()) { // fast path for empty collection - return ""; - } else { - final Iterator iter = objects.iterator(); - final T first = iter.next(); - - if ( ! iter.hasNext() ) // fast path for singleton collections - return first.toString(); - else { // full path for 2+ collection that actually need a join - final StringBuilder ret = new StringBuilder(first.toString()); - while(iter.hasNext()) { - ret.append(separator); - ret.append(iter.next().toString()); - } - return ret.toString(); - } - } - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/LazyGenotypesContext.java b/public/java/src/org/broadinstitute/variant/variantcontext/LazyGenotypesContext.java deleted file mode 100644 index 4825615a2..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/LazyGenotypesContext.java +++ /dev/null @@ -1,198 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -/** - * Lazy-loading GenotypesContext. A lazy-loading context has access to the - * VCFParser and a unparsed string of genotype data. If the user attempts to manipulate - * the genotypes contained in this context, we decode the data and become a full blown - * GenotypesContext. However, if the user never does this we are spared a lot of expense - * decoding the genotypes unnecessarily. - */ -public class LazyGenotypesContext extends GenotypesContext { - /** The LazyParser we'll use to decode unparsedGenotypeData if necessary */ - final LazyParser parser; - - Object unparsedGenotypeData; - - /** - * nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data - * (known already in the parser). Useful for isEmpty and size() optimizations - */ - final int nUnparsedGenotypes; - - /** - * True if we've already decoded the values in unparsedGenotypeData - */ - boolean loaded = false; - - private final static ArrayList EMPTY = new ArrayList(0); - - /** - * Simple lazy parser interface. Provide an object implementing this - * interface to LazyGenotypesContext, and it's parse method will be called - * when the use of the lazy context requires the underlying genotypes data - * be parsed into Genotype objects. The data argument is the data provided - * to the LazyGenotypesContext holding encoded genotypes data - */ - public interface LazyParser { - @Requires("data != null") - @Ensures("result != null") - public LazyData parse(Object data); - } - - /** - * Returns the data used in the full GenotypesContext constructor - * - * {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List)} - */ - public static class LazyData { - final ArrayList genotypes; - final Map sampleNameToOffset; - final List sampleNamesInOrder; - - @Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null"}) - public LazyData(final ArrayList genotypes, - final List sampleNamesInOrder, - final Map sampleNameToOffset) { - this.genotypes = genotypes; - this.sampleNamesInOrder = sampleNamesInOrder; - this.sampleNameToOffset = sampleNameToOffset; - } - } - - /** - * Creates a new lazy loading genotypes context using the LazyParser to create - * genotypes data on demand. - * - * @param parser the parser to be used to load on-demand genotypes data - * @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary - * @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data - */ - @Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"}) - public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) { - super(EMPTY); - this.parser = parser; - this.unparsedGenotypeData = unparsedGenotypeData; - this.nUnparsedGenotypes = nUnparsedGenotypes; - } - - /** - * Overrides the genotypes accessor. If we haven't already, decode the genotypes data - * and store the decoded results in the appropriate variables. Otherwise we just - * returned the decoded result directly. Note some care needs to be taken here as - * the value in notToBeDirectlyAccessedGenotypes may diverge from what would be produced - * by decode, if after the first decode the genotypes themselves are replaced - * @return - */ - @Override - @Ensures("result != null") - protected ArrayList getGenotypes() { - decode(); - return notToBeDirectlyAccessedGenotypes; - } - - /** - * Force us to decode the genotypes, if not already done - */ - public void decode() { - if ( ! loaded ) { - //System.out.printf("Loading genotypes... %s:%d%n", contig, start); - LazyData parsed = parser.parse(unparsedGenotypeData); - notToBeDirectlyAccessedGenotypes = parsed.genotypes; - sampleNamesInOrder = parsed.sampleNamesInOrder; - sampleNameToOffset = parsed.sampleNameToOffset; - loaded = true; - unparsedGenotypeData = null; // don't hold the unparsed data any longer - - // warning -- this path allows us to create a VariantContext that doesn't run validateGenotypes() - // That said, it's not such an important routine -- it's just checking that the genotypes - // are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec - } - } - - /** - * Overrides the ensure* functionality. If the data hasn't been loaded - * yet and we want to build the cache, just decode it and we're done. If we've - * already decoded the data, though, go through the super class - */ - @Override - protected synchronized void ensureSampleNameMap() { - if ( ! loaded ) { - decode(); // will load up all of the necessary data - } else { - super.ensureSampleNameMap(); - } - } - - @Override - protected synchronized void ensureSampleOrdering() { - if ( ! loaded ) { - decode(); // will load up all of the necessary data - } else { - super.ensureSampleOrdering(); - } - } - - @Override - protected void invalidateSampleNameMap() { - // if the cache is invalidated, and we haven't loaded our data yet, do so - if ( ! loaded ) decode(); - super.invalidateSampleNameMap(); - } - - @Override - protected void invalidateSampleOrdering() { - // if the cache is invalidated, and we haven't loaded our data yet, do so - if ( ! loaded ) decode(); - super.invalidateSampleOrdering(); - } - - @Override - public boolean isEmpty() { - // optimization -- we know the number of samples in the unparsed data, so use it here to - // avoid parsing just to know if the genotypes context is empty - return loaded ? super.isEmpty() : nUnparsedGenotypes == 0; - } - - @Override - public int size() { - // optimization -- we know the number of samples in the unparsed data, so use it here to - // avoid parsing just to know the size of the context - return loaded ? super.size() : nUnparsedGenotypes; - } - - public Object getUnparsedGenotypeData() { - return unparsedGenotypeData; - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java deleted file mode 100644 index 1fce89431..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java +++ /dev/null @@ -1,1571 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; - -/** - * Class VariantContext - * - * == High-level overview == - * - * The VariantContext object is a single general class system for representing genetic variation data composed of: - * - * * Allele: representing single genetic haplotypes (A, T, ATC, -) - * * Genotype: an assignment of alleles for each chromosome of a single named sample at a particular locus - * * VariantContext: an abstract class holding all segregating alleles at a locus as well as genotypes - * for multiple individuals containing alleles at that locus - * - * The class system works by defining segregating alleles, creating a variant context representing the segregating - * information at a locus, and potentially creating and associating genotypes with individuals in the context. - * - * All of the classes are highly validating -- call validate() if you modify them -- so you can rely on the - * self-consistency of the data once you have a VariantContext in hand. The system has a rich set of assessor - * and manipulator routines, as well as more complex static support routines in VariantContextUtils. - * - * The VariantContext (and Genotype) objects are attributed (supporting addition of arbitrary key/value pairs) and - * filtered (can represent a variation that is viewed as suspect). - * - * VariantContexts are dynamically typed, so whether a VariantContext is a SNP, Indel, or NoVariant depends - * on the properties of the alleles in the context. See the detailed documentation on the Type parameter below. - * - * It's also easy to create subcontexts based on selected genotypes. - * - * == Working with Variant Contexts == - * By default, VariantContexts are immutable. In order to access (in the rare circumstances where you need them) - * setter routines, you need to create MutableVariantContexts and MutableGenotypes. - * - * === Some example data === - * - * Allele A, Aref, T, Tref; - * Allele del, delRef, ATC, ATCref; - * - * A [ref] / T at 10 - * GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10); - * - * - / ATC [ref] from 20-23 - * GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22); - * - * // - [ref] / ATC immediately after 20 - * GenomeLoc insLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 20); - * - * === Alleles === - * - * See the documentation in the Allele class itself - * - * What are they? - * - * Alleles can be either reference or non-reference - * - * Example alleles used here: - * - * del = new Allele("-"); - * A = new Allele("A"); - * Aref = new Allele("A", true); - * T = new Allele("T"); - * ATC = new Allele("ATC"); - * - * === Creating variant contexts === - * - * ==== By hand ==== - * - * Here's an example of a A/T polymorphism with the A being reference: - * - *
- * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref, T));
- * 
- * - * If you want to create a non-variant site, just put in a single reference allele - * - *
- * VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref));
- * 
- * - * A deletion is just as easy: - * - *
- * VariantContext vc = new VariantContext(name, delLoc, Arrays.asList(ATCref, del));
- * 
- * - * The only 2 things that distinguishes between a insertion and deletion are the reference allele - * and the location of the variation. An insertion has a Null reference allele and at least - * one non-reference Non-Null allele. Additionally, the location of the insertion is immediately after - * a 1-bp GenomeLoc (at say 20). - * - *
- * VariantContext vc = new VariantContext("name", insLoc, Arrays.asList(delRef, ATC));
- * 
- * - * ==== Converting rods and other data structures to VCs ==== - * - * You can convert many common types into VariantContexts using the general function: - * - *
- * VariantContextAdaptors.convertToVariantContext(name, myObject)
- * 
- * - * dbSNP and VCFs, for example, can be passed in as myObject and a VariantContext corresponding to that - * object will be returned. A null return type indicates that the type isn't yet supported. This is the best - * and easiest way to create contexts using RODs. - * - * - * === Working with genotypes === - * - *
- * List alleles = Arrays.asList(Aref, T);
- * Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "g1", 10);
- * Genotype g2 = new Genotype(Arrays.asList(Aref, T), "g2", 10);
- * Genotype g3 = new Genotype(Arrays.asList(T, T), "g3", 10);
- * VariantContext vc = new VariantContext(snpLoc, alleles, Arrays.asList(g1, g2, g3));
- * 
- * - * At this point we have 3 genotypes in our context, g1-g3. - * - * You can assess a good deal of information about the genotypes through the VariantContext: - * - *
- * vc.hasGenotypes()
- * vc.isMonomorphicInSamples()
- * vc.isPolymorphicInSamples()
- * vc.getSamples().size()
- *
- * vc.getGenotypes()
- * vc.getGenotypes().get("g1")
- * vc.hasGenotype("g1")
- *
- * vc.getCalledChrCount()
- * vc.getCalledChrCount(Aref)
- * vc.getCalledChrCount(T)
- * 
- * - * === NO_CALL alleles === - * - * The system allows one to create Genotypes carrying special NO_CALL alleles that aren't present in the - * set of context alleles and that represent undetermined alleles in a genotype: - * - * Genotype g4 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "NO_DATA_FOR_SAMPLE", 10); - * - * - * === subcontexts === - * It's also very easy get subcontext based only the data in a subset of the genotypes: - * - *
- * VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2));
- * VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1));
- * 
- * - * - * Fully decoding. Currently VariantContexts support some fields, particularly those - * stored as generic attributes, to be of any type. For example, a field AB might - * be naturally a floating point number, 0.51, but when it's read into a VC its - * not decoded into the Java presentation but left as a string "0.51". A fully - * decoded VariantContext is one where all values have been converted to their - * corresponding Java object types, based on the types declared in a VCFHeader. - * - * The fullyDecode() takes a header object and creates a new fully decoded VariantContext - * where all fields are converted to their true java representation. The VCBuilder - * can be told that all fields are fully decoded, in which case no work is done when - * asking for a fully decoded version of the VC. - * - * - * @author depristo - */ -public class VariantContext implements Feature { // to enable tribble integration - private final static boolean WARN_ABOUT_BAD_END = true; - private final static int MAX_ALLELE_SIZE_FOR_NON_SV = 150; - private boolean fullyDecoded = false; - protected CommonInfo commonInfo = null; - public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; - - public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); - - /** The location of this VariantContext */ - final protected String contig; - final protected long start; - final protected long stop; - private final String ID; - - /** The type (cached for performance reasons) of this context */ - protected Type type = null; - - /** A set of the alleles segregating in this context */ - final protected List alleles; - - /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ - protected GenotypesContext genotypes = null; - - /** Counts for each of the possible Genotype types in this context */ - protected int[] genotypeCounts = null; - - public final static GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES; - - // a fast cached access point to the ref / alt alleles for biallelic case - private Allele REF = null; - - // set to the alt allele when biallelic, otherwise == null - private Allele ALT = null; - - /* cached monomorphic value: null -> not yet computed, False, True */ - private Boolean monomorphic = null; - - // --------------------------------------------------------------------------------------------------------- - // - // validation mode - // - // --------------------------------------------------------------------------------------------------------- - - public enum Validation { - ALLELES, - GENOTYPES - } - - private final static EnumSet NO_VALIDATION = EnumSet.noneOf(Validation.class); - - // --------------------------------------------------------------------------------------------------------- - // - // constructors: see VariantContextBuilder - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Copy constructor - * - * @param other the VariantContext to copy - */ - protected VariantContext(VariantContext other) { - this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(), - other.getAlleles(), other.getGenotypes(), other.getLog10PError(), - other.getFiltersMaybeNull(), - other.getAttributes(), - other.fullyDecoded, NO_VALIDATION); - } - - /** - * the actual constructor. Private access only - * - * @param source source - * @param contig the contig - * @param start the start base (one based) - * @param stop the stop reference base (one based) - * @param alleles alleles - * @param genotypes genotypes map - * @param log10PError qual - * @param filters filters: use null for unfiltered and empty set for passes filters - * @param attributes attributes - * @param validationToPerform set of validation steps to take - */ - protected VariantContext(final String source, - final String ID, - final String contig, - final long start, - final long stop, - final Collection alleles, - final GenotypesContext genotypes, - final double log10PError, - final Set filters, - final Map attributes, - final boolean fullyDecoded, - final EnumSet validationToPerform ) { - if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } - this.contig = contig; - this.start = start; - this.stop = stop; - - // intern for efficiency. equals calls will generate NPE if ID is inappropriately passed in as null - if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string"); - this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID; - - this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); - - if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } - - // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles - this.alleles = makeAlleles(alleles); - - if ( genotypes == null || genotypes == NO_GENOTYPES ) { - this.genotypes = NO_GENOTYPES; - } else { - this.genotypes = genotypes.immutable(); - } - - // cache the REF and ALT alleles - int nAlleles = alleles.size(); - for ( Allele a : alleles ) { - if ( a.isReference() ) { - REF = a; - } else if ( nAlleles == 2 ) { // only cache ALT when biallelic - ALT = a; - } - } - - this.fullyDecoded = fullyDecoded; - - if ( ! validationToPerform.isEmpty() ) { - validate(validationToPerform); - } - } - - // --------------------------------------------------------------------------------------------------------- - // - // Selectors - // - // --------------------------------------------------------------------------------------------------------- - - /** - * This method subsets down to a set of samples. - * - * At the same time returns the alleles to just those in use by the samples, - * if rederiveAllelesFromGenotypes is true, otherwise the full set of alleles - * in this VC is returned as the set of alleles in the subContext, even if - * some of those alleles aren't in the samples - * - * WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING? - * - * @param sampleNames the sample names - * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default - * @return new VariantContext subsetting to just the given samples - */ - public VariantContext subContextFromSamples(Set sampleNames, final boolean rederiveAllelesFromGenotypes ) { - if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) { - return this; // fast path when you don't have any work to do - } else { - VariantContextBuilder builder = new VariantContextBuilder(this); - GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames); - - if ( rederiveAllelesFromGenotypes ) - builder.alleles(allelesOfGenotypes(newGenotypes)); - else { - builder.alleles(alleles); - } - - return builder.genotypes(newGenotypes).make(); - } - } - - /** - * @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true - * - * @param sampleNames - * @return - */ - public VariantContext subContextFromSamples(final Set sampleNames) { - return subContextFromSamples(sampleNames, true); - } - - public VariantContext subContextFromSample(String sampleName) { - return subContextFromSamples(Collections.singleton(sampleName)); - } - - /** - * helper routine for subcontext - * @param genotypes genotypes - * @return allele set - */ - private final Set allelesOfGenotypes(Collection genotypes) { - final Set alleles = new HashSet(); - - boolean addedref = false; - for ( final Genotype g : genotypes ) { - for ( final Allele a : g.getAlleles() ) { - addedref = addedref || a.isReference(); - if ( a.isCalled() ) - alleles.add(a); - } - } - if ( ! addedref ) alleles.add(getReference()); - - return alleles; - } - - // --------------------------------------------------------------------------------------------------------- - // - // type operations - // - // --------------------------------------------------------------------------------------------------------- - - /** - * see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3 - * - * Format: - * dbSNP variation class - * Rules for assigning allele classes - * Sample allele definition - * - * Single Nucleotide Polymorphisms (SNPs)a - * Strictly defined as single base substitutions involving A, T, C, or G. - * A/T - * - * Deletion/Insertion Polymorphisms (DIPs) - * Designated using the full sequence of the insertion as one allele, and either a fully - * defined string for the variant allele or a '-' character to specify the deleted allele. - * This class will be assigned to a variation if the variation alleles are of different lengths or - * if one of the alleles is deleted ('-'). - * T/-/CCTA/G - * - * No-variation - * Reports may be submitted for segments of sequence that are assayed and determined to be invariant - * in the sample. - * (NoVariation) - * - * Mixed - * Mix of other classes - * - * Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population - * - * - * Not currently supported: - * - * Heterozygous sequence - * The term heterozygous is used to specify a region detected by certain methods that do not - * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking - * sequence must be provided to define a sequence context for the variation. - * (heterozygous) - * - * Microsatellite or short tandem repeat (STR) - * Alleles are designated by providing the repeat motif and the copy number for each allele. - * Expansion of the allele repeat motif designated in dbSNP into full-length sequence will - * be only an approximation of the true genomic sequence because many microsatellite markers are - * not fully sequenced and are resolved as size variants only. - * (CAC)8/9/10/11 - * - * Named variant - * Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon - * dimorphism for Alu or line elements. These variations frequently include a deletion '-' indicator - * for the absent allele. - * (alu) / - - * - * Multi-Nucleotide Polymorphism (MNP) - * Assigned to variations that are multi-base variations of a single, common length - * GGA/AGT - */ - public enum Type { - NO_VARIATION, - SNP, - MNP, // a multi-nucleotide polymorphism - INDEL, - SYMBOLIC, - MIXED, - } - - /** - * Determines (if necessary) and returns the type of this variation by examining the alleles it contains. - * - * @return the type of this VariantContext - **/ - public Type getType() { - if ( type == null ) - determineType(); - - return type; - } - - /** - * convenience method for SNPs - * - * @return true if this is a SNP, false otherwise - */ - public boolean isSNP() { return getType() == Type.SNP; } - - - /** - * convenience method for variants - * - * @return true if this is a variant allele, false if it's reference - */ - public boolean isVariant() { return getType() != Type.NO_VARIATION; } - - /** - * convenience method for point events - * - * @return true if this is a SNP or ref site, false if it's an indel or mixed event - */ - public boolean isPointEvent() { return isSNP() || !isVariant(); } - - /** - * convenience method for indels - * - * @return true if this is an indel, false otherwise - */ - public boolean isIndel() { return getType() == Type.INDEL; } - - /** - * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) - */ - public boolean isSimpleInsertion() { - // can't just call !isSimpleDeletion() because of complex indels - return getType() == Type.INDEL && isBiallelic() && getReference().length() == 1; - } - - /** - * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) - */ - public boolean isSimpleDeletion() { - // can't just call !isSimpleInsertion() because of complex indels - return getType() == Type.INDEL && isBiallelic() && getAlternateAllele(0).length() == 1; - } - - /** - * @return true if the alleles indicate neither a simple deletion nor a simple insertion - */ - public boolean isComplexIndel() { - return isIndel() && !isSimpleDeletion() && !isSimpleInsertion(); - } - - public boolean isSymbolic() { - return getType() == Type.SYMBOLIC; - } - - public boolean isStructuralIndel() { - if ( getType() == Type.INDEL ) { - List sizes = getIndelLengths(); - if ( sizes != null ) { - for ( Integer length : sizes ) { - if ( length > MAX_ALLELE_SIZE_FOR_NON_SV ) { - return true; - } - } - } - } - return false; - } - - /** - * - * @return true if the variant is symbolic or a large indel - */ - public boolean isSymbolicOrSV() { - return isSymbolic() || isStructuralIndel(); - } - - public boolean isMNP() { - return getType() == Type.MNP; - } - - /** - * convenience method for indels - * - * @return true if this is an mixed variation, false otherwise - */ - public boolean isMixed() { return getType() == Type.MIXED; } - - - // --------------------------------------------------------------------------------------------------------- - // - // Generic accessors - // - // --------------------------------------------------------------------------------------------------------- - - public boolean hasID() { - return getID() != VCFConstants.EMPTY_ID_FIELD; - } - - public boolean emptyID() { - return ! hasID(); - } - - public String getID() { - return ID; - } - - - // --------------------------------------------------------------------------------------------------------- - // - // get routines to access context info fields - // - // --------------------------------------------------------------------------------------------------------- - public String getSource() { return commonInfo.getName(); } - public Set getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); } - public Set getFilters() { return commonInfo.getFilters(); } - public boolean isFiltered() { return commonInfo.isFiltered(); } - public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } - public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); } - public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } - public double getLog10PError() { return commonInfo.getLog10PError(); } - public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } - - public Map getAttributes() { return commonInfo.getAttributes(); } - public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } - public Object getAttribute(String key) { return commonInfo.getAttribute(key); } - - public Object getAttribute(String key, Object defaultValue) { - return commonInfo.getAttribute(key, defaultValue); - } - - public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - public CommonInfo getCommonInfo() { - return commonInfo; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with alleles - // - // --------------------------------------------------------------------------------------------------------- - - /** - * @return the reference allele for this context - */ - public Allele getReference() { - Allele ref = REF; - if ( ref == null ) - throw new IllegalStateException("BUG: no reference allele found at " + this); - return ref; - } - - - /** - * @return true if the context is strictly bi-allelic - */ - public boolean isBiallelic() { - return getNAlleles() == 2; - } - - /** - * @return The number of segregating alleles in this context - */ - public int getNAlleles() { - return alleles.size(); - } - - /** - * Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes - * - * This function is caching, so it's only expensive on the first call - * - * @param defaultPloidy the default ploidy, if all samples are no-called - * @return default, or the max ploidy - */ - public int getMaxPloidy(final int defaultPloidy) { - return genotypes.getMaxPloidy(defaultPloidy); - } - - /** - * @return The allele sharing the same bases as this String. A convenience method; better to use byte[] - */ - public Allele getAllele(String allele) { - return getAllele(allele.getBytes()); - } - - /** - * @return The allele sharing the same bases as this byte[], or null if no such allele is present. - */ - public Allele getAllele(byte[] allele) { - return Allele.getMatchingAllele(getAlleles(), allele); - } - - /** - * @return True if this context contains Allele allele, or false otherwise - */ - public boolean hasAllele(final Allele allele) { - return hasAllele(allele, false, true); - } - - public boolean hasAllele(final Allele allele, final boolean ignoreRefState) { - return hasAllele(allele, ignoreRefState, true); - } - - public boolean hasAlternateAllele(final Allele allele) { - return hasAllele(allele, false, false); - } - - public boolean hasAlternateAllele(final Allele allele, final boolean ignoreRefState) { - return hasAllele(allele, ignoreRefState, false); - } - - private boolean hasAllele(final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele) { - if ( (considerRefAllele && allele == REF) || allele == ALT ) // optimization for cached cases - return true; - - final List allelesToConsider = considerRefAllele ? getAlleles() : getAlternateAlleles(); - for ( Allele a : allelesToConsider ) { - if ( a.equals(allele, ignoreRefState) ) - return true; - } - - return false; - } - - - /** - * Gets the alleles. This method should return all of the alleles present at the location, - * including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. If the reference is not an allele in this context it will not be included. - * - * @return the set of alleles - */ - public List getAlleles() { return alleles; } - - /** - * Gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference allele. There are no constraints imposed on the ordering of alleles - * in the set. - * - * @return the set of alternate alleles - */ - public List getAlternateAlleles() { - return alleles.subList(1, alleles.size()); - } - - /** - * Gets the sizes of the alternate alleles if they are insertion/deletion events, and returns a list of their sizes - * - * @return a list of indel lengths ( null if not of type indel or mixed ) - */ - public List getIndelLengths() { - if ( getType() != Type.INDEL && getType() != Type.MIXED ) { - return null; - } - - List lengths = new ArrayList(); - for ( Allele a : getAlternateAlleles() ) { - lengths.add(a.length() - getReference().length()); - } - - return lengths; - } - - /** - * @param i -- the ith allele (from 0 to n - 2 for a context with n alleles including a reference allele) - * @return the ith non-reference allele in this context - * @throws IllegalArgumentException if i is invalid - */ - public Allele getAlternateAllele(int i) { - return alleles.get(i+1); - } - - /** - * @param other VariantContext whose alleles to compare against - * @return true if this VariantContext has the same alleles (both ref and alts) as other, - * regardless of ordering. Otherwise returns false. - */ - public boolean hasSameAllelesAs ( final VariantContext other ) { - return hasSameAlternateAllelesAs(other) && other.getReference().equals(getReference(), false); - } - - /** - * @param other VariantContext whose alternate alleles to compare against - * @return true if this VariantContext has the same alternate alleles as other, - * regardless of ordering. Otherwise returns false. - */ - public boolean hasSameAlternateAllelesAs ( final VariantContext other ) { - List thisAlternateAlleles = getAlternateAlleles(); - List otherAlternateAlleles = other.getAlternateAlleles(); - - if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { - return false; - } - - for ( Allele allele : thisAlternateAlleles ) { - if ( ! otherAlternateAlleles.contains(allele) ) { - return false; - } - } - - return true; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Working with genotypes - // - // --------------------------------------------------------------------------------------------------------- - - /** - * @return the number of samples in the context - */ - public int getNSamples() { - return genotypes.size(); - } - - /** - * @return true if the context has associated genotypes - */ - public boolean hasGenotypes() { - return ! genotypes.isEmpty(); - } - - public boolean hasGenotypes(Collection sampleNames) { - return genotypes.containsSamples(sampleNames); - } - - /** - * @return set of all Genotypes associated with this context - */ - public GenotypesContext getGenotypes() { - return genotypes; - } - - public Iterable getGenotypesOrderedByName() { - return genotypes.iterateInSampleNameOrder(); - } - - public Iterable getGenotypesOrderedBy(Iterable sampleOrdering) { - return genotypes.iterateInSampleNameOrder(sampleOrdering); - } - - /** - * Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map - * for consistency with the multi-get function. - * - * @param sampleName the sample name - * @return mapping from sample name to genotype - * @throws IllegalArgumentException if sampleName isn't bound to a genotype - */ - public GenotypesContext getGenotypes(String sampleName) { - return getGenotypes(Collections.singleton(sampleName)); - } - - /** - * Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map - * for consistency with the multi-get function. - * - * For testing convenience only - * - * @param sampleNames a unique list of sample names - * @return subsetting genotypes context - * @throws IllegalArgumentException if sampleName isn't bound to a genotype - */ - protected GenotypesContext getGenotypes(Collection sampleNames) { - return getGenotypes().subsetToSamples(new HashSet(sampleNames)); - } - - public GenotypesContext getGenotypes(Set sampleNames) { - return getGenotypes().subsetToSamples(sampleNames); - } - - - /** - * @return the set of all sample names in this context, not ordered - */ - public Set getSampleNames() { - return getGenotypes().getSampleNames(); - } - - public List getSampleNamesOrderedByName() { - return getGenotypes().getSampleNamesOrderedByName(); - } - - /** - * @param sample the sample name - * - * @return the Genotype associated with the given sample in this context or null if the sample is not in this context - */ - public Genotype getGenotype(String sample) { - return getGenotypes().get(sample); - } - - public boolean hasGenotype(String sample) { - return getGenotypes().containsSample(sample); - } - - public Genotype getGenotype(int ith) { - return genotypes.get(ith); - } - - - /** - * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS) - * - * @return chromosome count - */ - public int getCalledChrCount() { - final Set noSamples = Collections.emptySet(); - return getCalledChrCount(noSamples); - } - - /** - * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS) - * - * @param sampleIds IDs of samples to take into account. If empty then all samples are included. - * @return chromosome count - */ - public int getCalledChrCount(Set sampleIds) { - int n = 0; - GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds); - - for ( final Genotype g : genotypes) { - for ( final Allele a : g.getAlleles() ) - n += a.isNoCall() ? 0 : 1; - } - - return n; - } - - /** - * Returns the number of chromosomes carrying allele A in the genotypes - * - * @param a allele - * @return chromosome count - */ - public int getCalledChrCount(Allele a) { - return getCalledChrCount(a,new HashSet(0)); - } - - /** - * Returns the number of chromosomes carrying allele A in the genotypes - * - * @param a allele - * @param sampleIds - IDs of samples to take into account. If empty then all samples are included. - * @return chromosome count - */ - public int getCalledChrCount(Allele a, Set sampleIds) { - int n = 0; - GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds); - - for ( final Genotype g : genotypes ) { - n += g.countAllele(a); - } - - return n; - } - - /** - * Genotype-specific functions -- are the genotypes monomorphic w.r.t. to the alleles segregating at this - * site? That is, is the number of alternate alleles among all fo the genotype == 0? - * - * @return true if it's monomorphic - */ - public boolean isMonomorphicInSamples() { - if ( monomorphic == null ) - monomorphic = ! isVariant() || (hasGenotypes() && getCalledChrCount(getReference()) == getCalledChrCount()); - return monomorphic; - } - - /** - * Genotype-specific functions -- are the genotypes polymorphic w.r.t. to the alleles segregating at this - * site? That is, is the number of alternate alleles among all fo the genotype > 0? - * - * @return true if it's polymorphic - */ - public boolean isPolymorphicInSamples() { - return ! isMonomorphicInSamples(); - } - - private void calculateGenotypeCounts() { - if ( genotypeCounts == null ) { - genotypeCounts = new int[GenotypeType.values().length]; - - for ( final Genotype g : getGenotypes() ) { - genotypeCounts[g.getType().ordinal()]++; - } - } - } - - /** - * Genotype-specific functions -- how many no-calls are there in the genotypes? - * - * @return number of no calls - */ - public int getNoCallCount() { - calculateGenotypeCounts(); - return genotypeCounts[GenotypeType.NO_CALL.ordinal()]; - } - - /** - * Genotype-specific functions -- how many hom ref calls are there in the genotypes? - * - * @return number of hom ref calls - */ - public int getHomRefCount() { - calculateGenotypeCounts(); - return genotypeCounts[GenotypeType.HOM_REF.ordinal()]; - } - - /** - * Genotype-specific functions -- how many het calls are there in the genotypes? - * - * @return number of het calls - */ - public int getHetCount() { - calculateGenotypeCounts(); - return genotypeCounts[GenotypeType.HET.ordinal()]; - } - - /** - * Genotype-specific functions -- how many hom var calls are there in the genotypes? - * - * @return number of hom var calls - */ - public int getHomVarCount() { - calculateGenotypeCounts(); - return genotypeCounts[GenotypeType.HOM_VAR.ordinal()]; - } - - /** - * Genotype-specific functions -- how many mixed calls are there in the genotypes? - * - * @return number of mixed calls - */ - public int getMixedCount() { - calculateGenotypeCounts(); - return genotypeCounts[GenotypeType.MIXED.ordinal()]; - } - - // --------------------------------------------------------------------------------------------------------- - // - // validation: extra-strict validation routines for paranoid users - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Run all extra-strict validation tests on a Variant Context object - * - * @param reportedReference the reported reference allele - * @param observedReference the actual reference allele - * @param rsIDs the true dbSNP IDs - */ - public void extraStrictValidation(final Allele reportedReference, final Allele observedReference, final Set rsIDs) { - // validate the reference - validateReferenceBases(reportedReference, observedReference); - - // validate the RS IDs - validateRSIDs(rsIDs); - - // validate the altenate alleles - validateAlternateAlleles(); - - // validate the AN and AC fields - validateChromosomeCounts(); - - // TODO: implement me - //checkReferenceTrack(); - } - - public void validateReferenceBases(final Allele reportedReference, final Allele observedReference) { - if ( reportedReference != null && !reportedReference.basesMatch(observedReference) ) { - throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), observedReference.getBaseString(), reportedReference.getBaseString())); - } - } - - public void validateRSIDs(Set rsIDs) { - if ( rsIDs != null && hasID() ) { - for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) { - if ( id.startsWith("rs") && !rsIDs.contains(id) ) - throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart())); - } - } - } - - public void validateAlternateAlleles() { - if ( !hasGenotypes() ) - return; - - List reportedAlleles = getAlleles(); - Set observedAlleles = new HashSet(); - observedAlleles.add(getReference()); - for ( final Genotype g : getGenotypes() ) { - if ( g.isCalled() ) - observedAlleles.addAll(g.getAlleles()); - } - if ( observedAlleles.contains(Allele.NO_CALL) ) - observedAlleles.remove(Allele.NO_CALL); - - if ( reportedAlleles.size() != observedAlleles.size() ) - throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart())); - - int originalSize = reportedAlleles.size(); - // take the intersection and see if things change - observedAlleles.retainAll(reportedAlleles); - if ( observedAlleles.size() != originalSize ) - throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart())); - } - - public void validateChromosomeCounts() { - if ( !hasGenotypes() ) - return; - - // AN - if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); - int observedAN = getCalledChrCount(); - if ( reportedAN != observedAN ) - throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAN, observedAN)); - } - - // AC - if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - ArrayList observedACs = new ArrayList(); - - // if there are alternate alleles, record the relevant tags - if ( getAlternateAlleles().size() > 0 ) { - for ( Allele allele : getAlternateAlleles() ) { - observedACs.add(getCalledChrCount(allele)); - } - } - else { // otherwise, set them to 0 - observedACs.add(0); - } - - if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { - Collections.sort(observedACs); - List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); - Collections.sort(reportedACs); - if ( observedACs.size() != reportedACs.size() ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.size(), observedACs.size())); - for (int i = 0; i < observedACs.size(); i++) { - if ( Integer.valueOf(reportedACs.get(i).toString()) != observedACs.get(i) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getChr(), getStart(), reportedACs.get(i), observedACs.get(i))); - } - } else { - if ( observedACs.size() != 1 ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getChr(), getStart())); - int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString()); - if ( reportedAC != observedACs.get(0) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAC, observedACs.get(0))); - } - } - } - - // --------------------------------------------------------------------------------------------------------- - // - // validation: the normal validation routines are called automatically upon creation of the VC - // - // --------------------------------------------------------------------------------------------------------- - - private boolean validate(final EnumSet validationToPerform) { - validateStop(); - for (final Validation val : validationToPerform ) { - switch (val) { - case ALLELES: validateAlleles(); break; - case GENOTYPES: validateGenotypes(); break; - default: throw new IllegalArgumentException("Unexpected validation mode " + val); - } - } - - return true; - } - - /** - * Check that getEnd() == END from the info field, if it's present - */ - private void validateStop() { - if ( hasAttribute(VCFConstants.END_KEY) ) { - final int end = getAttributeAsInt(VCFConstants.END_KEY, -1); - assert end != -1; - if ( end != getEnd() ) { - final String message = "Badly formed variant context at location " + getChr() + ":" - + getStart() + "; getEnd() was " + getEnd() - + " but this VariantContext contains an END key with value " + end; - if ( GeneralUtils.DEBUG_MODE_ENABLED && WARN_ABOUT_BAD_END ) { - System.err.println(message); - } - else { - throw new TribbleException(message); - } - } - } else { - final long length = (stop - start) + 1; - if ( ! hasSymbolicAlleles() && length != getReference().length() ) { - throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); - } - } - } - - private void validateAlleles() { - - boolean alreadySeenRef = false; - - for ( final Allele allele : alleles ) { - // make sure there's only one reference allele - if ( allele.isReference() ) { - if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); - alreadySeenRef = true; - } - - if ( allele.isNoCall() ) { - throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this); - } - } - - // make sure there's one reference allele - if ( ! alreadySeenRef ) - throw new IllegalArgumentException("No reference allele found in VariantContext"); - } - - private void validateGenotypes() { - if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); - - for ( final Genotype g : this.genotypes ) { - if ( g.isAvailable() ) { - for ( Allele gAllele : g.getAlleles() ) { - if ( ! hasAllele(gAllele) && gAllele.isCalled() ) - throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); - } - } - } - } - - // --------------------------------------------------------------------------------------------------------- - // - // utility routines - // - // --------------------------------------------------------------------------------------------------------- - - private void determineType() { - if ( type == null ) { - switch ( getNAlleles() ) { - case 0: - throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this); - case 1: - // note that this doesn't require a reference allele. You can be monomorphic independent of having a - // reference allele - type = Type.NO_VARIATION; - break; - default: - determinePolymorphicType(); - } - } - } - - private void determinePolymorphicType() { - type = null; - - // do a pairwise comparison of all alleles against the reference allele - for ( Allele allele : alleles ) { - if ( allele == REF ) - continue; - - // find the type of this allele relative to the reference - Type biallelicType = typeOfBiallelicVariant(REF, allele); - - // for the first alternate allele, set the type to be that one - if ( type == null ) { - type = biallelicType; - } - // if the type of this allele is different from that of a previous one, assign it the MIXED type and quit - else if ( biallelicType != type ) { - type = Type.MIXED; - return; - } - } - } - - private static Type typeOfBiallelicVariant(Allele ref, Allele allele) { - if ( ref.isSymbolic() ) - throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele"); - - if ( allele.isSymbolic() ) - return Type.SYMBOLIC; - - if ( ref.length() == allele.length() ) { - if ( allele.length() == 1 ) - return Type.SNP; - else - return Type.MNP; - } - - // Important note: previously we were checking that one allele is the prefix of the other. However, that's not an - // appropriate check as can be seen from the following example: - // REF = CTTA and ALT = C,CT,CA - // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check. - // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply - // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type - // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point - // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. - - return Type.INDEL; - - // old incorrect logic: - // if (oneIsPrefixOfOther(ref, allele)) - // return Type.INDEL; - // else - // return Type.MIXED; - } - - public String toString() { - return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s GT=%s", - getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop), - hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".", - this.getType(), - ParsingUtils.sortList(this.getAlleles()), - ParsingUtils.sortedString(this.getAttributes()), - this.getGenotypes()); - } - - public String toStringWithoutGenotypes() { - return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s", - getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop), - hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".", - this.getType(), - ParsingUtils.sortList(this.getAlleles()), - ParsingUtils.sortedString(this.getAttributes())); - } - - // protected basic manipulation routines - private static List makeAlleles(Collection alleles) { - final List alleleList = new ArrayList(alleles.size()); - - boolean sawRef = false; - for ( final Allele a : alleles ) { - for ( final Allele b : alleleList ) { - if ( a.equals(b, true) ) - throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a); - } - - // deal with the case where the first allele isn't the reference - if ( a.isReference() ) { - if ( sawRef ) - throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles); - alleleList.add(0, a); - sawRef = true; - } - else - alleleList.add(a); - } - - if ( alleleList.isEmpty() ) - throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list"); - - if ( alleleList.get(0).isNonReference() ) - throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles); - - return alleleList; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Fully decode - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Return a VC equivalent to this one but where all fields are fully decoded - * - * See VariantContext document about fully decoded - * - * @param header containing types about all fields in this VC - * @return a fully decoded version of this VC - */ - public VariantContext fullyDecode(final VCFHeader header, final boolean lenientDecoding) { - if ( isFullyDecoded() ) - return this; - else { - // TODO -- warning this is potentially very expensive as it creates copies over and over - final VariantContextBuilder builder = new VariantContextBuilder(this); - fullyDecodeInfo(builder, header, lenientDecoding); - fullyDecodeGenotypes(builder, header); - builder.fullyDecoded(true); - return builder.make(); - } - } - - /** - * See VariantContext document about fully decoded - * @return true if this is a fully decoded VC - */ - public boolean isFullyDecoded() { - return fullyDecoded; - } - - private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) { - builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding)); - } - - private final Map fullyDecodeAttributes(final Map attributes, - final VCFHeader header, - final boolean lenientDecoding) { - final Map newAttributes = new HashMap(10); - - for ( final Map.Entry attr : attributes.entrySet() ) { - final String field = attr.getKey(); - - if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) - continue; // gross, FT is part of the extended attributes - - final VCFCompoundHeaderLine format = VariantContextUtils.getMetaDataForField(header, field); - final Object decoded = decodeValue(field, attr.getValue(), format); - - if ( decoded != null && - ! lenientDecoding - && format.getCountType() != VCFHeaderLineCount.UNBOUNDED - && format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements - final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1; - final int expSize = format.getCount(this); - if ( obsSize != expSize ) { - throw new TribbleException.InvalidHeader("Discordant field size detected for field " + - field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " + - "but the header says this should have " + expSize + " values based on header record " + - format); - } - } - newAttributes.put(field, decoded); - } - - return newAttributes; - } - - private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) { - if ( value instanceof String ) { - if ( field.equals(VCFConstants.GENOTYPE_PL_KEY) ) - return GenotypeLikelihoods.fromPLField((String)value); - - final String string = (String)value; - if ( string.indexOf(",") != -1 ) { - final String[] splits = string.split(","); - final List values = new ArrayList(splits.length); - for ( int i = 0; i < splits.length; i++ ) - values.add(decodeOne(field, splits[i], format)); - return values; - } else { - return decodeOne(field, string, format); - } - } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { - final List asList = (List)value; - final List values = new ArrayList(asList.size()); - for ( final String s : asList ) - values.add(decodeOne(field, s, format)); - return values; - } else { - return value; - } - - // allowMissingValuesComparedToHeader - } - - private final Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format) { - try { - if ( string.equals(VCFConstants.MISSING_VALUE_v4) ) - return null; - else { - switch ( format.getType() ) { - case Character: return string; - case Flag: - final boolean b = Boolean.valueOf(string) || string.equals("1"); - if ( b == false ) - throw new TribbleException("VariantContext FLAG fields " + field + " cannot contain false values" - + " as seen at " + getChr() + ":" + getStart()); - return b; - case String: return string; - case Integer: return Integer.valueOf(string); - case Float: return Double.valueOf(string); - default: throw new TribbleException("Unexpected type for field" + field); - } - } - } catch (NumberFormatException e) { - throw new TribbleException("Could not decode field " + field + " with value " + string + " of declared type " + format.getType()); - } - } - - private final void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) { - final GenotypesContext gc = new GenotypesContext(); - for ( final Genotype g : getGenotypes() ) { - gc.add(fullyDecodeGenotypes(g, header)); - } - builder.genotypesNoValidation(gc); - } - - private final Genotype fullyDecodeGenotypes(final Genotype g, final VCFHeader header) { - final Map map = fullyDecodeAttributes(g.getExtendedAttributes(), header, true); - return new GenotypeBuilder(g).attributes(map).make(); - } - - // --------------------------------------------------------------------------------------------------------- - // - // tribble integration routines -- not for public consumption - // - // --------------------------------------------------------------------------------------------------------- - public String getChr() { - return contig; - } - - public int getStart() { - return (int)start; - } - - public int getEnd() { - return (int)stop; - } - - public boolean hasSymbolicAlleles() { - return hasSymbolicAlleles(getAlleles()); - } - - public static boolean hasSymbolicAlleles( final List alleles ) { - for ( final Allele a: alleles ) { - if (a.isSymbolic()) { - return true; - } - } - return false; - } - - public Allele getAltAlleleWithHighestAlleleCount() { - // optimization: for bi-allelic sites, just return the 1only alt allele - if ( isBiallelic() ) - return getAlternateAllele(0); - - Allele best = null; - int maxAC1 = 0; - for ( Allele a : getAlternateAlleles() ) { - final int ac = getCalledChrCount(a); - if ( ac >= maxAC1 ) { - maxAC1 = ac; - best = a; - } - - } - return best; - } - - /** - * Lookup the index of allele in this variant context - * - * @param allele the allele whose index we want to get - * @return the index of the allele into getAlleles(), or -1 if it cannot be found - */ - public int getAlleleIndex(final Allele allele) { - return getAlleles().indexOf(allele); - } - - /** - * Return the allele index #getAlleleIndex for each allele in alleles - * - * @param alleles the alleles we want to look up - * @return a list of indices for each allele, in order - */ - public List getAlleleIndices(final Collection alleles) { - final List indices = new LinkedList(); - for ( final Allele allele : alleles ) - indices.add(getAlleleIndex(allele)); - return indices; - } - - public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) { - final int index = getAlleleIndex(targetAllele); - if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this); - return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextBuilder.java deleted file mode 100644 index 276a6931a..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextBuilder.java +++ /dev/null @@ -1,482 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import com.google.java.contract.*; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.*; - -/** - * Builder class for VariantContext - * - * Some basic assumptions here: - * - * 1 -- data isn't protectively copied. If you provide an attribute map to - * the build, and modify it later, the builder will see this and so will any - * resulting variant contexts. It's best not to modify collections provided - * to a builder. - * - * 2 -- the system uses the standard builder model, allowing the simple construction idiom: - * - * builder.source("a").genotypes(gc).id("x").make() => VariantContext - * - * 3 -- The best way to copy a VariantContext is: - * - * new VariantContextBuilder(vc).make() => a copy of VC - * - * 4 -- validation of arguments is done at the during the final make() call, so a - * VariantContextBuilder can exist in an inconsistent state as long as those issues - * are resolved before the call to make() is issued. - * - * @author depristo - */ -public class VariantContextBuilder { - // required fields - private boolean fullyDecoded = false; - private String source = null; - private String contig = null; - private long start = -1; - private long stop = -1; - private Collection alleles = null; - - // optional -> these are set to the appropriate default value - private String ID = VCFConstants.EMPTY_ID_FIELD; - private GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES; - private double log10PError = VariantContext.NO_LOG10_PERROR; - private Set filters = null; - private Map attributes = null; - private boolean attributesCanBeModified = false; - - /** enum of what must be validated */ - final private EnumSet toValidate = EnumSet.noneOf(VariantContext.Validation.class); - - /** - * Create an empty VariantContextBuilder where all values adopt their default values. Note that - * source, chr, start, stop, and alleles must eventually be filled in, or the resulting VariantContext - * will throw an error. - */ - public VariantContextBuilder() {} - - /** - * Create an empty VariantContextBuilder where all values adopt their default values, but the bare min. - * of info (source, chr, start, stop, and alleles) have been provided to start. - */ - @Requires({"source != null", "contig != null", "start >= 0", "stop >= 0", - "alleles != null && !alleles.isEmpty()"}) - public VariantContextBuilder(String source, String contig, long start, long stop, Collection alleles) { - this.source = source; - this.contig = contig; - this.start = start; - this.stop = stop; - this.alleles = alleles; - this.attributes = Collections.emptyMap(); // immutable - toValidate.add(VariantContext.Validation.ALLELES); - } - - /** - * Returns a new builder based on parent -- the new VC will have all fields initialized - * to their corresponding values in parent. This is the best way to create a derived VariantContext - * - * @param parent Cannot be null - */ - public VariantContextBuilder(VariantContext parent) { - if ( parent == null ) throw new IllegalArgumentException("BUG: VariantContextBuilder parent argument cannot be null in VariantContextBuilder"); - this.alleles = parent.alleles; - this.attributes = parent.getAttributes(); - this.attributesCanBeModified = false; - this.contig = parent.contig; - this.filters = parent.getFiltersMaybeNull(); - this.genotypes = parent.genotypes; - this.ID = parent.getID(); - this.log10PError = parent.getLog10PError(); - this.source = parent.getSource(); - this.start = parent.getStart(); - this.stop = parent.getEnd(); - this.fullyDecoded = parent.isFullyDecoded(); - } - - public VariantContextBuilder(VariantContextBuilder parent) { - if ( parent == null ) throw new IllegalArgumentException("BUG: VariantContext parent argument cannot be null in VariantContextBuilder"); - this.alleles = parent.alleles; - this.attributesCanBeModified = false; - this.contig = parent.contig; - this.genotypes = parent.genotypes; - this.ID = parent.ID; - this.log10PError = parent.log10PError; - this.source = parent.source; - this.start = parent.start; - this.stop = parent.stop; - this.fullyDecoded = parent.fullyDecoded; - - this.attributes(parent.attributes); - this.filters(parent.filters); - } - - public VariantContextBuilder copy() { - return new VariantContextBuilder(this); - } - - /** - * Tells this builder to use this collection of alleles for the resulting VariantContext - * - * @param alleles - * @return this builder - */ - @Requires({"alleles != null", "!alleles.isEmpty()"}) - public VariantContextBuilder alleles(final Collection alleles) { - this.alleles = alleles; - toValidate.add(VariantContext.Validation.ALLELES); - return this; - } - - public VariantContextBuilder alleles(final List alleleStrings) { - List alleles = new ArrayList(alleleStrings.size()); - - for ( int i = 0; i < alleleStrings.size(); i++ ) { - alleles.add(Allele.create(alleleStrings.get(i), i == 0)); - } - - return alleles(alleles); - } - - public VariantContextBuilder alleles(final String ... alleleStrings) { - return alleles(Arrays.asList(alleleStrings)); - } - - public List getAlleles() { - return new ArrayList(alleles); - } - - /** - * Tells this builder to use this map of attributes alleles for the resulting VariantContext - * - * Attributes can be null -> meaning there are no attributes. After - * calling this routine the builder assumes it can modify the attributes - * object here, if subsequent calls are made to set attribute values - * @param attributes - */ - public VariantContextBuilder attributes(final Map attributes) { - if (attributes != null) { - this.attributes = attributes; - } - else { - this.attributes = new HashMap(); - } - - this.attributesCanBeModified = true; - return this; - } - - /** - * Puts the key -> value mapping into this builder's attributes - * - * @param key - * @param value - * @return - */ - @Requires({"key != null"}) - @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()+1)"}) - public VariantContextBuilder attribute(final String key, final Object value) { - makeAttributesModifiable(); - attributes.put(key, value); - return this; - } - - /** - * Removes key if present in the attributes - * - * @param key - * @return - */ - @Requires({"key != null"}) - @Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()-1)"}) - public VariantContextBuilder rmAttribute(final String key) { - makeAttributesModifiable(); - attributes.remove(key); - return this; - } - - /** - * Makes the attributes field modifiable. In many cases attributes is just a pointer to an immutable - * collection, so methods that want to add / remove records require the attributes to be copied to a - */ - @Ensures({"this.attributesCanBeModified"}) - private void makeAttributesModifiable() { - if ( ! attributesCanBeModified ) { - this.attributesCanBeModified = true; - this.attributes = new HashMap(attributes); - } - } - - /** - * This builder's filters are set to this value - * - * filters can be null -> meaning there are no filters - * @param filters - */ - public VariantContextBuilder filters(final Set filters) { - this.filters = filters; - return this; - } - - /** - * {@link #filters} - * - * @param filters - * @return - */ - public VariantContextBuilder filters(final String ... filters) { - filters(new LinkedHashSet(Arrays.asList(filters))); - return this; - } - - @Requires({"filter != null", "!filter.equals(\"PASS\")"}) - public VariantContextBuilder filter(final String filter) { - if ( this.filters == null ) this.filters = new LinkedHashSet(1); - this.filters.add(filter); - return this; - } - - /** - * Tells this builder that the resulting VariantContext should have PASS filters - * - * @return - */ - public VariantContextBuilder passFilters() { - return filters(VariantContext.PASSES_FILTERS); - } - - /** - * Tells this builder that the resulting VariantContext be unfiltered - * - * @return - */ - public VariantContextBuilder unfiltered() { - this.filters = null; - return this; - } - - /** - * Tells this builder that the resulting VariantContext should use this genotypes GenotypeContext - * - * Note that genotypes can be null -> meaning there are no genotypes - * - * @param genotypes - */ - public VariantContextBuilder genotypes(final GenotypesContext genotypes) { - this.genotypes = genotypes; - if ( genotypes != null ) - toValidate.add(VariantContext.Validation.GENOTYPES); - return this; - } - - public VariantContextBuilder genotypesNoValidation(final GenotypesContext genotypes) { - this.genotypes = genotypes; - return this; - } - - /** - * Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes - * - * Note that genotypes can be null -> meaning there are no genotypes - * - * @param genotypes - */ - public VariantContextBuilder genotypes(final Collection genotypes) { - return genotypes(GenotypesContext.copy(genotypes)); - } - - /** - * Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes - * @param genotypes - */ - public VariantContextBuilder genotypes(final Genotype ... genotypes) { - return genotypes(GenotypesContext.copy(Arrays.asList(genotypes))); - } - - /** - * Tells this builder that the resulting VariantContext should not contain any GenotypeContext - */ - public VariantContextBuilder noGenotypes() { - this.genotypes = null; - return this; - } - - /** - * Tells us that the resulting VariantContext should have ID - * @param ID - * @return - */ - @Requires("ID != null") - public VariantContextBuilder id(final String ID) { - this.ID = ID; - return this; - } - - /** - * Tells us that the resulting VariantContext should not have an ID - * @return - */ - public VariantContextBuilder noID() { - return id(VCFConstants.EMPTY_ID_FIELD); - } - - /** - * Tells us that the resulting VariantContext should have log10PError - * @param log10PError - * @return - */ - @Requires("log10PError <= 0 || log10PError == VariantContext.NO_LOG10_PERROR") - public VariantContextBuilder log10PError(final double log10PError) { - this.log10PError = log10PError; - return this; - } - - /** - * Tells us that the resulting VariantContext should have source field set to source - * @param source - * @return - */ - @Requires("source != null") - public VariantContextBuilder source(final String source) { - this.source = source; - return this; - } - - /** - * Tells us that the resulting VariantContext should have the specified location - * @param contig - * @param start - * @param stop - * @return - */ - @Requires({"contig != null", "start >= 0", "stop >= 0"}) - public VariantContextBuilder loc(final String contig, final long start, final long stop) { - this.contig = contig; - this.start = start; - this.stop = stop; - toValidate.add(VariantContext.Validation.ALLELES); - return this; - } - - /** - * Tells us that the resulting VariantContext should have the specified contig chr - * @param contig - * @return - */ - @Requires({"contig != null"}) - public VariantContextBuilder chr(final String contig) { - this.contig = contig; - return this; - } - - /** - * Tells us that the resulting VariantContext should have the specified contig start - * @param start - * @return - */ - @Requires({"start >= 0"}) - public VariantContextBuilder start(final long start) { - this.start = start; - toValidate.add(VariantContext.Validation.ALLELES); - return this; - } - - /** - * Tells us that the resulting VariantContext should have the specified contig stop - * @param stop - * @return - */ - @Requires({"stop >= 0"}) - public VariantContextBuilder stop(final long stop) { - this.stop = stop; - return this; - } - - /** - * @see #computeEndFromAlleles(java.util.List, int, int) with endForSymbolicAlleles == -1 - */ - public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start) { - return computeEndFromAlleles(alleles, start, -1); - } - - /** - * Compute the end position for this VariantContext from the alleles themselves - * - * assigns this builder the stop position computed. - * - * @param alleles the list of alleles to consider. The reference allele must be the first one - * @param start the known start position of this event - * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 - * if no is expected but will throw an error if one is found - * @return this builder - */ - @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) - public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { - stop(VariantContextUtils.computeEndFromAlleles(alleles, start, endForSymbolicAlleles)); - return this; - } - - /** - * @return true if this builder contains fully decoded data - * - * See VariantContext for more information - */ - public boolean isFullyDecoded() { - return fullyDecoded; - } - - /** - * Sets this builder's fully decoded state to true. - * - * A fully decoded builder indicates that all fields are represented by their - * proper java objects (e.g., Integer(10) not "10"). - * - * See VariantContext for more information - * - * @param isFullyDecoded - */ - public VariantContextBuilder fullyDecoded(boolean isFullyDecoded) { - this.fullyDecoded = isFullyDecoded; - return this; - } - - /** - * Takes all of the builder data provided up to this point, and instantiates - * a freshly allocated VariantContext with all of the builder data. This - * VariantContext is validated as appropriate and if not failing QC (and - * throwing an exception) is returned. - * - * Note that this function can be called multiple times to create multiple - * VariantContexts from the same builder. - */ - public VariantContext make() { - return new VariantContext(source, ID, contig, start, stop, alleles, - genotypes, log10PError, filters, attributes, - fullyDecoded, toValidate); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java deleted file mode 100644 index a5b7b6c04..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ /dev/null @@ -1,374 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.jexl2.Expression; -import org.apache.commons.jexl2.JexlEngine; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; - -public class VariantContextUtils { - - private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); - - final public static JexlEngine engine = new JexlEngine(); - private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false; - - static { - engine.setSilent(false); // will throw errors now for selects that don't evaluate properly - engine.setLenient(false); - engine.setDebug(false); - } - - /** - * Update the attributes of the attributes map given the VariantContext to reflect the - * proper chromosome-based VCF tags - * - * @param vc the VariantContext - * @param attributes the attributes map to populate; must not be null; may contain old values - * @param removeStaleValues should we remove stale values from the mapping? - * @return the attributes map provided as input, returned for programming convenience - */ - public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { - return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet(0)); - } - - /** - * Update the attributes of the attributes map given the VariantContext to reflect the - * proper chromosome-based VCF tags - * - * @param vc the VariantContext - * @param attributes the attributes map to populate; must not be null; may contain old values - * @param removeStaleValues should we remove stale values from the mapping? - * @param founderIds - Set of founders Ids to take into account. AF and FC will be calculated over the founders. - * If empty or null, counts are generated for all samples as unrelated individuals - * @return the attributes map provided as input, returned for programming convenience - */ - public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues, final Set founderIds) { - final int AN = vc.getCalledChrCount(); - - // if everyone is a no-call, remove the old attributes if requested - if ( AN == 0 && removeStaleValues ) { - if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) - attributes.remove(VCFConstants.ALLELE_COUNT_KEY); - if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) ) - attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); - if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) ) - attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); - return attributes; - } - - if ( vc.hasGenotypes() ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN); - - // if there are alternate alleles, record the relevant tags - if ( vc.getAlternateAlleles().size() > 0 ) { - ArrayList alleleFreqs = new ArrayList(); - ArrayList alleleCounts = new ArrayList(); - ArrayList foundersAlleleCounts = new ArrayList(); - double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds); - int foundersAltChromosomes; - for ( Allele allele : vc.getAlternateAlleles() ) { - foundersAltChromosomes = vc.getCalledChrCount(allele,founderIds); - alleleCounts.add(vc.getCalledChrCount(allele)); - foundersAlleleCounts.add(foundersAltChromosomes); - if ( AN == 0 ) { - alleleFreqs.add(0.0); - } else { - final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes; - alleleFreqs.add(freq); - } - } - - attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); - } else { - // if there's no alt AC and AF shouldn't be present - attributes.remove(VCFConstants.ALLELE_COUNT_KEY); - attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); - } - } - - return attributes; - } - - /** - * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper - * chromosome-based VCF tags based on the current VC produced by builder.make() - * - * @param builder the VariantContextBuilder we are updating - * @param removeStaleValues should we remove stale values from the mapping? - */ - public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) { - VariantContext vc = builder.make(); - builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, new HashSet(0))); - } - - /** - * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper - * chromosome-based VCF tags based on the current VC produced by builder.make() - * - * @param builder the VariantContextBuilder we are updating - * @param founderIds - Set of founders to take into account. AF and FC will be calculated over the founders only. - * If empty or null, counts are generated for all samples as unrelated individuals - * @param removeStaleValues should we remove stale values from the mapping? - */ - public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues, final Set founderIds) { - VariantContext vc = builder.make(); - builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); - } - - public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { - VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field); - if ( metaData == null ) metaData = header.getInfoHeaderLine(field); - if ( metaData == null ) { - if ( ASSUME_MISSING_FIELDS_ARE_STRINGS ) { - if ( ! MISSING_KEYS_WARNED_ABOUT.contains(field) ) { - MISSING_KEYS_WARNED_ABOUT.add(field); - if ( GeneralUtils.DEBUG_MODE_ENABLED ) - System.err.println("Field " + field + " missing from VCF header, assuming it is an unbounded string type"); - } - return new VCFInfoHeaderLine(field, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Auto-generated string header for " + field); - } - else - throw new TribbleException("Fully decoding VariantContext requires header line for all fields, but none was found for " + field); - } - return metaData; - } - - /** - * A simple but common wrapper for matching VariantContext objects using JEXL expressions - */ - public static class JexlVCMatchExp { - public String name; - public Expression exp; - - /** - * Create a new matcher expression with name and JEXL expression exp - * @param name name - * @param exp expression - */ - public JexlVCMatchExp(String name, Expression exp) { - this.name = name; - this.exp = exp; - } - } - - /** - * Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain - * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return - * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to - * match() below. - * - * @param names names - * @param exps expressions - * @return list of matches - */ - public static List initializeMatchExps(String[] names, String[] exps) { - if ( names == null || exps == null ) - throw new IllegalArgumentException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) ); - - if ( names.length != exps.length ) - throw new IllegalArgumentException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); - - Map map = new HashMap(); - for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); } - - return VariantContextUtils.initializeMatchExps(map); - } - - public static List initializeMatchExps(ArrayList names, ArrayList exps) { - String[] nameArray = new String[names.size()]; - String[] expArray = new String[exps.size()]; - return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray)); - } - - - /** - * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain - * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return - * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to - * match() below. - * - * @param names_and_exps mapping of names to expressions - * @return list of matches - */ - public static List initializeMatchExps(Map names_and_exps) { - List exps = new ArrayList(); - - for ( Map.Entry elt : names_and_exps.entrySet() ) { - String name = elt.getKey(); - String expStr = elt.getValue(); - - if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr); - try { - Expression exp = engine.createExpression(expStr); - exps.add(new JexlVCMatchExp(name, exp)); - } catch (Exception e) { - throw new IllegalArgumentException("Argument " + name + "has a bad value. Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax.") ; - } - } - - return exps; - } - - /** - * Returns true if exp match VC. See collection<> version for full docs. - * @param vc variant context - * @param exp expression - * @return true if there is a match - */ - public static boolean match(VariantContext vc, JexlVCMatchExp exp) { - return match(vc,Arrays.asList(exp)).get(exp); - } - - /** - * Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these - * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL - * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp - * expressions. - * - * @param vc variant context - * @param exps expressions - * @return true if there is a match - */ - public static Map match(VariantContext vc, Collection exps) { - return new JEXLMap(exps,vc); - - } - - /** - * Returns true if exp match VC/g. See collection<> version for full docs. - * @param vc variant context - * @param g genotype - * @param exp expression - * @return true if there is a match - */ - public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) { - return match(vc,g,Arrays.asList(exp)).get(exp); - } - - /** - * Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these - * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL - * expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp - * expressions. - * - * @param vc variant context - * @param g genotype - * @param exps expressions - * @return true if there is a match - */ - public static Map match(VariantContext vc, Genotype g, Collection exps) { - return new JEXLMap(exps,vc,g); - } - - /** - * Returns a newly allocated VC that is the same as VC, but without genotypes - * @param vc variant context - * @return new VC without genotypes - */ - @Requires("vc != null") - @Ensures("result != null") - public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return new VariantContextBuilder(vc).noGenotypes().make(); - } - - /** - * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes - * @param vcs collection of VCs - * @return new VCs without genotypes - */ - @Requires("vcs != null") - @Ensures("result != null") - public static Collection sitesOnlyVariantContexts(Collection vcs) { - List r = new ArrayList(); - for ( VariantContext vc : vcs ) - r.add(sitesOnlyVariantContext(vc)); - return r; - } - - // TODO: remove that after testing -// static private void verifyUniqueSampleNames(Collection unsortedVCs) { -// Set names = new HashSet(); -// for ( VariantContext vc : unsortedVCs ) { -// for ( String name : vc.getSampleNames() ) { -// //System.out.printf("Checking %s %b%n", name, names.contains(name)); -// if ( names.contains(name) ) -// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); -// } -// -// names.addAll(vc.getSampleNames()); -// } -// } - - - public static int getSize( VariantContext vc ) { - return vc.getEnd() - vc.getStart() + 1; - } - - public static final Set genotypeNames(final Collection genotypes) { - final Set names = new HashSet(genotypes.size()); - for ( final Genotype g : genotypes ) - names.add(g.getSampleName()); - return names; - } - - /** - * Compute the end position for this VariantContext from the alleles themselves - * - * In the trivial case this is a single BP event and end = start (open intervals) - * In general the end is start + ref length - 1, handling the case where ref length == 0 - * However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases - * - * @param alleles the list of alleles to consider. The reference allele must be the first one - * @param start the known start position of this event - * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 - * if no is expected but will throw an error if one is found - * @return this builder - */ - @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) - public static int computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { - final Allele ref = alleles.get(0); - - if ( ref.isNonReference() ) - throw new IllegalStateException("computeEndFromAlleles requires first allele to be reference"); - - if ( VariantContext.hasSymbolicAlleles(alleles) ) { - if ( endForSymbolicAlleles == -1 ) - throw new IllegalStateException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided"); - return endForSymbolicAlleles; - } else { - return start + Math.max(ref.length() - 1, 0); - } - } - -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantJEXLContext.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantJEXLContext.java deleted file mode 100644 index efdd54b57..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantJEXLContext.java +++ /dev/null @@ -1,326 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import org.apache.commons.jexl2.JexlContext; -import org.apache.commons.jexl2.MapContext; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.VCFConstants; - -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -/** - * - * @author aaron - * @author depristo - * - * Class VariantJEXLContext - * - * implements the JEXML context for VariantContext; this saves us from - * having to generate a JEXML context lookup map everytime we want to evaluate an expression. - * - * This is package protected, only classes in variantcontext should have access to it. - * - * // todo -- clean up to remove or better support genotype filtering - */ - -class VariantJEXLContext implements JexlContext { - // our stored variant context - private VariantContext vc; - - private interface AttributeGetter { - public Object get(VariantContext vc); - } - - private static Map x = new HashMap(); - - static { - x.put("vc", new AttributeGetter() { public Object get(VariantContext vc) { return vc; }}); - x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }}); - x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }}); - x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }}); - x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return -10 * vc.getLog10PError(); }}); - x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }}); - x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }}); - x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }}); - -// x.put("GT", new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }}); - x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }}); - x.put("hetCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }}); - x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }}); - } - - public VariantJEXLContext(VariantContext vc) { - this.vc = vc; - } - - public Object get(String name) { - Object result = null; - if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map - result = x.get(name).get(vc); - } else if ( vc.hasAttribute(name)) { - result = vc.getAttribute(name); - } else if ( vc.getFilters().contains(name) ) { - result = "1"; - } - - //System.out.printf("dynamic lookup %s => %s%n", name, result); - - return result; - } - - public boolean has(String name) { - return get(name) != null; - } - - public void set(String name, Object value) { - throw new UnsupportedOperationException("remove() not supported on a VariantJEXLContext"); - } -} - - - - -/** - * this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value - * as requested to save as much processing time as possible. - * - * Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the - * JexlContext's get() - * - */ - -class JEXLMap implements Map { - // our variant context and/or Genotype - private final VariantContext vc; - private final Genotype g; - - // our context - private JexlContext jContext = null; - - // our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp - private Map jexl; - - - public JEXLMap(Collection jexlCollection, VariantContext vc, Genotype g) { - this.vc = vc; - this.g = g; - initialize(jexlCollection); - } - - public JEXLMap(Collection jexlCollection, VariantContext vc) { - this(jexlCollection, vc, null); - } - - private void initialize(Collection jexlCollection) { - jexl = new HashMap(); - for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) { - jexl.put(exp, null); - } - } - - /** - * create the internal JexlContext, only when required. This code is where new JEXL context variables - * should get added. - * - */ - private void createContext() { - if ( g == null ) { - // todo -- remove dependancy on g to the entire system - jContext = new VariantJEXLContext(vc); - } else { - // - // this whole branch is here just to support G jexl operations - // - Map infoMap = new HashMap(); - - if ( vc != null ) { - // create a mapping of what we know about the variant context, its Chromosome, positions, etc. - infoMap.put("CHROM", vc.getChr()); - infoMap.put("POS", vc.getStart()); - infoMap.put("TYPE", vc.getType().toString()); - infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual())); - - // add alleles - infoMap.put("ALLELES", GeneralUtils.join(";", vc.getAlleles())); - infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles())); - - // add attributes - addAttributesToMap(infoMap, vc.getAttributes()); - - // add filter fields - infoMap.put("FILTER", vc.isFiltered() ? "1" : "0"); - for ( Object filterCode : vc.getFilters() ) { - infoMap.put(String.valueOf(filterCode), "1"); - } - - // add genotype-specific fields - // TODO -- implement me when we figure out a good way to represent this - // for ( Genotype g : vc.getGenotypes().values() ) { - // String prefix = g.getSampleName() + "."; - // addAttributesToMap(infoMap, g.getAttributes(), prefix); - // infoMap.put(prefix + "GT", g.getGenotypeString()); - // } - - // add specific genotype if one is provided - infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString()); - infoMap.put("isHomRef", g.isHomRef() ? "1" : "0"); - infoMap.put("isHet", g.isHet() ? "1" : "0"); - infoMap.put("isHomVar", g.isHomVar() ? "1" : "0"); - infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, g.getGQ()); - if ( g.hasDP() ) - infoMap.put(VCFConstants.DEPTH_KEY, g.getDP()); - for ( Map.Entry e : g.getExtendedAttributes().entrySet() ) { - if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) ) - infoMap.put(e.getKey(), e.getValue()); - } - } - - // create the internal context that we can evaluate expressions against - - jContext = new MapContext(infoMap); - } - } - - /** - * @return the size of the internal data structure - */ - public int size() { - return jexl.size(); - } - - /** - * @return true if we're empty - */ - public boolean isEmpty() { return this.jexl.isEmpty(); } - - /** - * do we contain the specified key - * @param o the key - * @return true if we have a value for that key - */ - public boolean containsKey(Object o) { return jexl.containsKey(o); } - - public Boolean get(Object o) { - // if we've already determined the value, return it - if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o); - - // try and cast the expression - VariantContextUtils.JexlVCMatchExp e = (VariantContextUtils.JexlVCMatchExp) o; - evaluateExpression(e); - return jexl.get(e); - } - - /** - * get the keyset of map - * @return a set of keys of type JexlVCMatchExp - */ - public Set keySet() { - return jexl.keySet(); - } - - /** - * get all the values of the map. This is an expensive call, since it evaluates all keys that haven't - * been evaluated yet. This is fine if you truely want all the keys, but if you only want a portion, or know - * the keys you want, you would be better off using get() to get them by name. - * @return a collection of boolean values, representing the results of all the variants evaluated - */ - public Collection values() { - // this is an expensive call - for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet()) - if (jexl.get(exp) == null) - evaluateExpression(exp); - return jexl.values(); - } - - /** - * evaulate a JexlVCMatchExp's expression, given the current context (and setup the context if it's null) - * @param exp the JexlVCMatchExp to evaluate - */ - private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) { - // if the context is null, we need to create it to evaluate the JEXL expression - if (this.jContext == null) createContext(); - try { - final Boolean value = (Boolean) exp.exp.evaluate(jContext); - // treat errors as no match - jexl.put(exp, value == null ? false : value); - } catch (Exception e) { - // if exception happens because variable is undefined (i.e. field in expression is not present), evaluate to FALSE - // todo - might be safer if we explicitly checked for an exception type, but Apache's API doesn't seem to have that ability - if (e.getMessage().contains("undefined variable")) - jexl.put(exp,false); - else - throw new IllegalArgumentException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage())); - } - } - - /** - * helper function: adds the list of attributes to the information map we're building - * @param infoMap the map - * @param attributes the attributes - */ - private static void addAttributesToMap(Map infoMap, Map attributes ) { - for (Map.Entry e : attributes.entrySet()) { - infoMap.put(e.getKey(), String.valueOf(e.getValue())); - } - } - - public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) { - return jexl.put(jexlVCMatchExp,aBoolean); - } - - public void putAll(Map map) { - jexl.putAll(map); - } - - // ////////////////////////////////////////////////////////////////////////////////////// - // The Following are unsupported at the moment - // ////////////////////////////////////////////////////////////////////////////////////// - - // this doesn't make much sense to implement, boolean doesn't offer too much variety to deal - // with evaluating every key in the internal map. - public boolean containsValue(Object o) { - throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap"); - } - - // this doesn't make much sense - public Boolean remove(Object o) { - throw new UnsupportedOperationException("remove() not supported on a JEXLMap"); - } - - - public Set> entrySet() { - throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); - } - - // nope - public void clear() { - throw new UnsupportedOperationException("clear() not supported on a JEXLMap"); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Encoder.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Encoder.java deleted file mode 100644 index d2a3d5435..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Encoder.java +++ /dev/null @@ -1,279 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.variant.bcf2.BCF2Type; -import org.broadinstitute.variant.bcf2.BCF2Utils; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.*; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public final class BCF2Encoder { - // TODO -- increase default size? - public static final int WRITE_BUFFER_INITIAL_SIZE = 16384; - private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE); - - // -------------------------------------------------------------------------------- - // - // Functions to return the data being encoded here - // - // -------------------------------------------------------------------------------- - - @Ensures("result != null") - public byte[] getRecordBytes() { - byte[] bytes = encodeStream.toByteArray(); - encodeStream.reset(); - return bytes; - } - - // -------------------------------------------------------------------------------- - // - // Writing typed values (have type byte) - // - // -------------------------------------------------------------------------------- - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedMissing(final BCF2Type type) throws IOException { - encodeType(0, type); - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTyped(final Object value, final BCF2Type type) throws IOException { - if ( value == null ) - encodeTypedMissing(type); - else { - switch ( type ) { - case INT8: - case INT16: - case INT32: encodeTypedInt((Integer)value, type); break; - case FLOAT: encodeTypedFloat((Double) value); break; - case CHAR: encodeTypedString((String) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedInt(final int v) throws IOException { - final BCF2Type type = BCF2Utils.determineIntegerType(v); - encodeTypedInt(v, type); - } - - @Requires("type.isIntegerType()") - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException { - encodeType(1, type); - encodeRawInt(v, type); - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedString(final String s) throws IOException { - encodeTypedString(s.getBytes()); - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedString(final byte[] s) throws IOException { - if ( s == null ) - encodeType(0, BCF2Type.CHAR); - else { - encodeType(s.length, BCF2Type.CHAR); - for ( int i = 0; i < s.length; i++ ) { - encodeRawChar(s[i]); - } - } - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTypedFloat(final double d) throws IOException { - encodeType(1, BCF2Type.FLOAT); - encodeRawFloat(d); - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeTyped(List v, final BCF2Type type) throws IOException { - if ( type == BCF2Type.CHAR && v.size() != 0 ) { - final String s = BCF2Utils.collapseStringList((List) v); - v = stringToBytes(s); - } - - encodeType(v.size(), type); - encodeRawValues(v, type); - } - - // -------------------------------------------------------------------------------- - // - // Writing raw values (don't have a type byte) - // - // -------------------------------------------------------------------------------- - - public final void encodeRawValues(final Collection v, final BCF2Type type) throws IOException { - for ( final T v1 : v ) { - encodeRawValue(v1, type); - } - } - - public final void encodeRawValue(final T value, final BCF2Type type) throws IOException { - try { - if ( value == type.getMissingJavaValue() ) - encodeRawMissingValue(type); - else { - switch (type) { - case INT8: - case INT16: - case INT32: encodeRawBytes((Integer) value, type); break; - case FLOAT: encodeRawFloat((Double) value); break; - case CHAR: encodeRawChar((Byte) value); break; - default: throw new IllegalArgumentException("Illegal type encountered " + type); - } - } - } catch ( ClassCastException e ) { - throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value); - } - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeRawMissingValue(final BCF2Type type) throws IOException { - encodeRawBytes(type.getMissingBytes(), type); - } - - @Requires("size >= 0") - public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException { - for ( int i = 0; i < size; i++ ) - encodeRawMissingValue(type); - } - - // -------------------------------------------------------------------------------- - // - // low-level encoders - // - // -------------------------------------------------------------------------------- - - public final void encodeRawChar(final byte c) throws IOException { - encodeStream.write(c); - } - - public final void encodeRawFloat(final double value) throws IOException { - encodeRawBytes(Float.floatToIntBits((float) value), BCF2Type.FLOAT); - } - - @Requires("size >= 0") - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeType(final int size, final BCF2Type type) throws IOException { - if ( size <= BCF2Utils.MAX_INLINE_ELEMENTS ) { - final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type); - encodeStream.write(typeByte); - } else { - final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type); - encodeStream.write(typeByte); - // write in the overflow size - encodeTypedInt(size); - } - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeRawInt(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - @Ensures("encodeStream.size() > old(encodeStream.size())") - public final void encodeRawBytes(final int value, final BCF2Type type) throws IOException { - type.write(value, encodeStream); - } - - // -------------------------------------------------------------------------------- - // - // utility functions - // - // -------------------------------------------------------------------------------- - - @Requires({"s != null", "sizeToWrite >= 0"}) - public void encodeRawString(final String s, final int sizeToWrite) throws IOException { - final byte[] bytes = s.getBytes(); - for ( int i = 0; i < sizeToWrite; i++ ) - if ( i < bytes.length ) - encodeRawChar(bytes[i]); - else - encodeRawMissingValue(BCF2Type.CHAR); - } - - /** - * Totally generic encoder that examines o, determines the best way to encode it, and encodes it - * - * This method is incredibly slow, but it's only used for UnitTests so it doesn't matter - * - * @param o - * @return - */ - @Requires("o != null") - public final BCF2Type encode(final Object o) throws IOException { - if ( o == null ) throw new IllegalArgumentException("Generic encode cannot deal with null values"); - - if ( o instanceof List ) { - final BCF2Type type = determineBCFType(((List) o).get(0)); - encodeTyped((List) o, type); - return type; - } else { - final BCF2Type type = determineBCFType(o); - encodeTyped(o, type); - return type; - } - } - - @Requires("arg != null") - private final BCF2Type determineBCFType(final Object arg) { - final Object toType = arg instanceof List ? ((List)arg).get(0) : arg; - - if ( toType instanceof Integer ) - return BCF2Utils.determineIntegerType((Integer) toType); - else if ( toType instanceof String ) - return BCF2Type.CHAR; - else if ( toType instanceof Double ) - return BCF2Type.FLOAT; - else - throw new IllegalArgumentException("No native encoding for Object of type " + arg.getClass().getSimpleName()); - } - - private final List stringToBytes(final String v) throws IOException { - if ( v == null || v.equals("") ) - return Collections.emptyList(); - else { - // TODO -- this needs to be optimized away for efficiency - final byte[] bytes = v.getBytes(); - final List l = new ArrayList(bytes.length); - for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]); - return l; - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldEncoder.java deleted file mode 100644 index a04a6bf37..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldEncoder.java +++ /dev/null @@ -1,518 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.broadinstitute.variant.bcf2.BCF2Type; -import org.broadinstitute.variant.bcf2.BCF2Utils; -import org.broadinstitute.variant.vcf.VCFCompoundHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineCount; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -@Invariant({ - "headerLine != null", - "dictionaryOffsetType.isIntegerType()", - "dictionaryOffset >= 0" -}) -public abstract class BCF2FieldEncoder { - /** - * The header line describing the field we will encode values of - */ - final VCFCompoundHeaderLine headerLine; - - /** - * The BCF2 type we'll use to encoder this field, if it can be determined statically. - * If not, this variable must be null - */ - final BCF2Type staticType; - - /** - * The integer offset into the strings map of the BCF2 file corresponding to this - * field. - */ - final int dictionaryOffset; - - /** - * The integer type we use to encode our dictionary offset in the BCF2 file - */ - final BCF2Type dictionaryOffsetType; - - // ---------------------------------------------------------------------- - // - // Constructor - // - // ---------------------------------------------------------------------- - - @Requires({"headerLine != null", "dict != null"}) - private BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final Map dict, final BCF2Type staticType) { - this.headerLine = headerLine; - this.staticType = staticType; - - final Integer offset = dict.get(getField()); - if ( offset == null ) throw new IllegalStateException("Format error: could not find string " + getField() + " in header as required by BCF"); - this.dictionaryOffset = offset; - dictionaryOffsetType = BCF2Utils.determineIntegerType(offset); - } - - // ---------------------------------------------------------------------- - // - // Basic accessors - // - // ---------------------------------------------------------------------- - - @Ensures("result != null") - public final String getField() { return headerLine.getID(); } - - /** - * Write the field key (dictionary offset and type) into the BCF2Encoder stream - * - * @param encoder where we write our dictionary offset - * @throws IOException - */ - @Requires("encoder != null") - public final void writeFieldKey(final BCF2Encoder encoder) throws IOException { - encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType); - } - - @Override - public String toString() { - return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName(); - } - - // ---------------------------------------------------------------------- - // - // methods to determine the number of encoded elements - // - // ---------------------------------------------------------------------- - - @Ensures("result != null") - protected final VCFHeaderLineCount getCountType() { - return headerLine.getCountType(); - } - - /** - * True if this field has a constant, fixed number of elements (such as 1 for an atomic integer) - * - * @return - */ - @Ensures("result != (hasValueDeterminedNumElements() || hasContextDeterminedNumElements())") - public boolean hasConstantNumElements() { - return getCountType() == VCFHeaderLineCount.INTEGER; - } - - /** - * True if the only way to determine how many elements this field contains is by - * inspecting the actual value directly, such as when the number of elements - * is a variable length list per site or per genotype. - * @return - */ - @Ensures("result != (hasConstantNumElements() || hasContextDeterminedNumElements())") - public boolean hasValueDeterminedNumElements() { - return getCountType() == VCFHeaderLineCount.UNBOUNDED; - } - - /** - * True if this field has a non-fixed number of elements that depends only on the properties - * of the current VariantContext, such as one value per Allele or per genotype configuration. - * - * @return - */ - @Ensures("result != (hasValueDeterminedNumElements() || hasConstantNumElements())") - public boolean hasContextDeterminedNumElements() { - return ! hasConstantNumElements() && ! hasValueDeterminedNumElements(); - } - - /** - * Get the number of elements, assuming this field has a constant number of elements. - * @return - */ - @Requires("hasConstantNumElements()") - @Ensures("result >= 0") - public int numElements() { - return headerLine.getCount(); - } - - /** - * Get the number of elements by looking at the actual value provided - * @return - */ - @Requires("hasValueDeterminedNumElements()") - @Ensures("result >= 0") - public int numElements(final Object value) { - return numElementsFromValue(value); - } - - /** - * Get the number of elements, assuming this field has context-determined number of elements. - * @return - */ - @Requires("hasContextDeterminedNumElements()") - @Ensures("result >= 0") - public int numElements(final VariantContext vc) { - return headerLine.getCount(vc); - } - - /** - * A convenience access for the number of elements, returning - * the number of encoded elements, either from the fixed number - * it has, from the VC, or from the value itself. - * @param vc - * @param value - * @return - */ - @Ensures("result >= 0") - public final int numElements(final VariantContext vc, final Object value) { - if ( hasConstantNumElements() ) return numElements(); - else if ( hasContextDeterminedNumElements() ) return numElements(vc); - else return numElements(value); - } - - /** - * Given a value, return the number of elements we will encode for it. - * - * Assumes the value is encoded as a List - * - * @param value - * @return - */ - @Requires("hasValueDeterminedNumElements()") - @Ensures("result >= 0") - protected int numElementsFromValue(final Object value) { - if ( value == null ) return 0; - else if ( value instanceof List ) return ((List) value).size(); - else return 1; - } - - // ---------------------------------------------------------------------- - // - // methods to determine the BCF2 type of the encoded values - // - // ---------------------------------------------------------------------- - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return - */ - @Ensures("result || isDynamicallyTyped()") - public final boolean isStaticallyTyped() { return ! isDynamicallyTyped(); } - - /** - * Is the BCF2 type of this field static, or does it have to be determine from - * the actual field value itself? - * @return - */ - @Ensures("result || isStaticallyTyped()") - public final boolean isDynamicallyTyped() { return staticType == null; } - - /** - * Get the BCF2 type for this field, either from the static type of the - * field itself or by inspecting the value itself. - * - * @return - */ - public final BCF2Type getType(final Object value) { - return isDynamicallyTyped() ? getDynamicType(value) : getStaticType(); - } - - @Requires("isStaticallyTyped()") - @Ensures("result != null") - public final BCF2Type getStaticType() { - return staticType; - } - - @Requires("isDynamicallyTyped()") - @Ensures("result != null") - public BCF2Type getDynamicType(final Object value) { - throw new IllegalStateException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField()); - } - - // ---------------------------------------------------------------------- - // - // methods to encode values, including the key abstract method - // - // ---------------------------------------------------------------------- - - /** - * Key abstract method that should encode a value of the given type into the encoder. - * - * Value will be of a type appropriate to the underlying encoder. If the genotype field is represented as - * an int[], this will be value, and the encoder needs to handle encoding all of the values in the int[]. - * - * The argument should be used, not the getType() method in the superclass as an outer loop might have - * decided a more general type (int16) to use, even through this encoder could have been done with int8. - * - * If minValues > 0, then encodeValue must write in at least minValues items from value. If value is atomic, - * this means that minValues - 1 MISSING values should be added to the encoder. If minValues is a collection - * type (int[]) then minValues - values.length should be added. This argument is intended to handle padding - * of values in genotype fields. - * - * @param encoder - * @param value - * @param type - * @param minValues - * @throws IOException - */ - @Requires({"encoder != null", "isDynamicallyTyped() || type == getStaticType()", "minValues >= 0"}) - public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException; - - // ---------------------------------------------------------------------- - // - // Subclass to encode Strings - // - // ---------------------------------------------------------------------- - - public static class StringOrCharacter extends BCF2FieldEncoder { - public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.CHAR); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - final String s = javaStringToBCF2String(value); - encoder.encodeRawString(s, Math.max(s.length(), minValues)); - } - - // - // Regardless of what the header says, BCF2 strings and characters are always encoded - // as arrays of CHAR type, which has a variable number of elements depending on the - // exact string being encoded - // - @Override public boolean hasConstantNumElements() { return false; } - @Override public boolean hasContextDeterminedNumElements() { return false; } - @Override public boolean hasValueDeterminedNumElements() { return true; } - @Override protected int numElementsFromValue(final Object value) { - return value == null ? 0 : javaStringToBCF2String(value).length(); - } - - /** - * Recode the incoming object to a String, compacting it into a - * BCF2 string if the value is a list. - * - * @param value a String or List to encode, or null - * @return a non-null string to encode - */ - @Ensures("result != null") - private String javaStringToBCF2String(final Object value) { - if ( value == null ) - return ""; - else if (value instanceof List) { - final List l = (List)value; - if ( l.isEmpty() ) return ""; - else return BCF2Utils.collapseStringList(l); - } else - return (String)value; - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLAG - // - // ---------------------------------------------------------------------- - - public static class Flag extends BCF2FieldEncoder { - public Flag(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.INT8); - if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 ) - throw new IllegalStateException("Flag encoder only supports atomic flags for field " + getField()); - } - - @Override - public int numElements() { - return 1; // the header says 0 but we will write 1 value - } - - @Override - @Requires({"minValues <= 1", "value != null", "value instanceof Boolean", "((Boolean)value) == true"}) - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - encoder.encodeRawBytes(1, getStaticType()); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode FLOAT - // - // ---------------------------------------------------------------------- - - public static class Float extends BCF2FieldEncoder { - final boolean isAtomic; - - public Float(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, BCF2Type.FLOAT); - isAtomic = hasConstantNumElements() && numElements() == 1; - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - // TODO -- can be restructured to avoid toList operation - if ( isAtomic ) { - // fast path for fields with 1 fixed float value - if ( value != null ) { - encoder.encodeRawFloat((Double)value); - count++; - } - } else { - // handle generic case - final List doubles = toList(Double.class, value); - for ( final Double d : doubles ) { - if ( d != null ) { // necessary because .,. => [null, null] in VC - encoder.encodeRawFloat(d); - count++; - } - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode int[] - // - // ---------------------------------------------------------------------- - - public static class IntArray extends BCF2FieldEncoder { - public IntArray(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - protected int numElementsFromValue(final Object value) { - return value == null ? 0 : ((int[])value).length; - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value); - } - - @Requires("value == null || ((int[])value).length <= minValues") - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - for ( final int i : (int[])value ) { - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - // ---------------------------------------------------------------------- - // - // Subclass to encode List - // - // ---------------------------------------------------------------------- - - /** - * Specialized int encoder for atomic (non-list) integers - */ - public static class AtomicInt extends BCF2FieldEncoder { - public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - if ( value != null ) { - encoder.encodeRawInt((Integer)value, type); - count++; - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - public static class GenericInts extends BCF2FieldEncoder { - public GenericInts(final VCFCompoundHeaderLine headerLine, final Map dict ) { - super(headerLine, dict, null); - } - - @Override - public BCF2Type getDynamicType(final Object value) { - return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(toList(Integer.class, value)); - } - - @Override - public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException { - int count = 0; - for ( final Integer i : toList(Integer.class, value) ) { - if ( i != null ) { // necessary because .,. => [null, null] in VC - encoder.encodeRawInt(i, type); - count++; - } - } - for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type); - } - } - - - // ---------------------------------------------------------------------- - // - // Helper methods - // - // ---------------------------------------------------------------------- - - /** - * Helper function that takes an object and returns a list representation - * of it: - * - * o == null => [] - * o is a list => o - * else => [o] - * - * @param o - * @return - */ - private final static List toList(final Class c, final Object o) { - if ( o == null ) return Collections.emptyList(); - else if ( o instanceof List ) return (List)o; - else return Collections.singletonList((T)o); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriter.java deleted file mode 100644 index 9667d1889..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriter.java +++ /dev/null @@ -1,337 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.variant.bcf2.BCF2Type; -import org.broadinstitute.variant.bcf2.BCF2Utils; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public abstract class BCF2FieldWriter { - private final VCFHeader header; - private final BCF2FieldEncoder fieldEncoder; - - @Requires({"header != null", "fieldEncoder != null"}) - protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - this.header = header; - this.fieldEncoder = fieldEncoder; - } - - @Ensures("result != null") - protected VCFHeader getHeader() { return header; } - @Ensures("result != null") - protected BCF2FieldEncoder getFieldEncoder() { - return fieldEncoder; - } - @Ensures("result != null") - protected String getField() { return getFieldEncoder().getField(); } - - @Requires("vc != null") - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - fieldEncoder.writeFieldKey(encoder); - } - - public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness - - @Override - public String toString() { - return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder(); - } - - // -------------------------------------------------------------------------------- - // - // Sites writers - // - // -------------------------------------------------------------------------------- - - public static abstract class SiteWriter extends BCF2FieldWriter { - protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException; - } - - public static class GenericSiteWriter extends SiteWriter { - public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - final Object rawValue = vc.getAttribute(getField(), null); - final BCF2Type type = getFieldEncoder().getType(rawValue); - if ( rawValue == null ) { - // the value is missing, just write in null - encoder.encodeType(0, type); - } else { - final int valueCount = getFieldEncoder().numElements(vc, rawValue); - encoder.encodeType(valueCount, type); - getFieldEncoder().encodeValue(encoder, rawValue, type, valueCount); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Genotypes writers - // - // -------------------------------------------------------------------------------- - - public static abstract class GenotypesWriter extends BCF2FieldWriter { - int nValuesPerGenotype = -1; - BCF2Type encodingType = null; - - protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - - if ( fieldEncoder.hasConstantNumElements() ) { - nValuesPerGenotype = getFieldEncoder().numElements(); - } - } - - @Override - @Requires({"encodingType != null", - "nValuesPerGenotype >= 0 || ! getFieldEncoder().hasConstantNumElements()"}) - @Ensures("nValuesPerGenotype >= 0") - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // writes the key information - super.start(encoder, vc); - - // only update if we need to - if ( ! getFieldEncoder().hasConstantNumElements() ) { - if ( getFieldEncoder().hasContextDeterminedNumElements() ) - // we are cheap -- just depends on genotype of allele counts - nValuesPerGenotype = getFieldEncoder().numElements(vc); - else - // we have to go fishing through the values themselves (expensive) - nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc); - } - - encoder.encodeType(nValuesPerGenotype, encodingType); - } - - @Requires({"encodingType != null", "nValuesPerGenotype >= 0"}) - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final Object fieldValue = g.getExtendedAttribute(getField(), null); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - @Ensures({"result >= 0"}) - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField())); - } - - @Ensures({"result >= 0"}) - private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { - int size = -1; - - for ( final Genotype g : vc.getGenotypes() ) { - size = Math.max(size, numElements(vc, g)); - } - - return size; - } - } - - public static class StaticallyTypeGenotypesWriter extends GenotypesWriter { - public StaticallyTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - encodingType = getFieldEncoder().getStaticType(); - } - } - - public static class IntegerTypeGenotypesWriter extends GenotypesWriter { - public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // the only value that is dynamic are integers - final List values = new ArrayList(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - for ( final Object i : BCF2Utils.toList(g.getExtendedAttribute(getField(), null)) ) { - if ( i != null ) values.add((Integer)i); // we know they are all integers - } - } - - encodingType = BCF2Utils.determineIntegerType(values); - super.start(encoder, vc); - } - } - - public static class IGFGenotypesWriter extends GenotypesWriter { - final IntGenotypeFieldAccessors.Accessor ige; - - public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) { - super(header, fieldEncoder); - this.ige = ige; - - if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) ) - throw new IllegalArgumentException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField()); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - // TODO - // TODO this piece of code consumes like 10% of the runtime alone because fo the vc.getGenotypes() iteration - // TODO - encodingType = BCF2Type.INT8; - for ( final Genotype g : vc.getGenotypes() ) { - final int[] pls = ige.getValues(g); - final BCF2Type plsType = getFieldEncoder().getType(pls); - encodingType = BCF2Utils.maxIntegerType(encodingType, plsType); - if ( encodingType == BCF2Type.INT32 ) - break; // stop early - } - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype); - } - - @Override - protected int numElements(final VariantContext vc, final Genotype g) { - return ige.getSize(g); - } - } - - public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter { - public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final String fieldValue = g.getFilters(); - getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); - } - - @Override - protected int numElements(final VariantContext vc, final Genotype g) { - return getFieldEncoder().numElements(vc, g.getFilters()); - } - } - - public static class GTWriter extends GenotypesWriter { - final Map alleleMapForTriPlus = new HashMap(5); - Allele ref, alt1; - - public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { - super(header, fieldEncoder); - } - - @Override - public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { - if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES ) - throw new IllegalStateException("Current BCF2 encoder cannot handle sites " + - "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have " - + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart()); - - encodingType = BCF2Type.INT8; - buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(2); - - super.start(encoder, vc); - } - - @Override - public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { - final int samplePloidy = g.getPloidy(); - for ( int i = 0; i < nValuesPerGenotype; i++ ) { - if ( i < samplePloidy ) { - // we encode the actual allele - final Allele a = g.getAllele(i); - final int offset = getAlleleOffset(a); - final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00); - encoder.encodeRawBytes(encoded, encodingType); - } else { - // we need to pad with missing as we have ploidy < max for this sample - encoder.encodeRawBytes(encodingType.getMissingBytes(), encodingType); - } - } - } - - /** - * Fast path code to determine the offset. - * - * Inline tests for == against ref (most common, first test) - * == alt1 (second most common, second test) - * == NO_CALL (third) - * and finally in the map from allele => offset for all alt 2+ alleles - * - * @param a the allele whose offset we wish to determine - * @return the offset (from 0) of the allele in the list of variant context alleles (-1 means NO_CALL) - */ - @Requires("a != null") - private final int getAlleleOffset(final Allele a) { - if ( a == ref ) return 0; - else if ( a == alt1 ) return 1; - else if ( a == Allele.NO_CALL ) return -1; - else { - final Integer o = alleleMapForTriPlus.get(a); - if ( o == null ) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a); - return o; - } - } - - private final void buildAlleleMap(final VariantContext vc) { - // these are fast path options to determine the offsets for - final int nAlleles = vc.getNAlleles(); - ref = vc.getReference(); - alt1 = nAlleles > 1 ? vc.getAlternateAllele(0) : null; - - if ( nAlleles > 2 ) { - // for multi-allelics we need to clear the map, and add additional looks - alleleMapForTriPlus.clear(); - final List alleles = vc.getAlleles(); - for ( int i = 2; i < alleles.size(); i++ ) { - alleleMapForTriPlus.put(alleles.get(i), i); - } - } - } - } -} - diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriterManager.java deleted file mode 100644 index a3cbc5bf3..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2FieldWriterManager.java +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; - -import java.util.HashMap; -import java.util.Map; - -/** - * See #BCFWriter for documentation on this classes role in encoding BCF2 files - * - * @author Mark DePristo - * @since 06/12 - */ -public class BCF2FieldWriterManager { - final Map siteWriters = new HashMap(); - final Map genotypesWriters = new HashMap(); - final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); - - public BCF2FieldWriterManager() { } - - /** - * Setup the FieldWriters appropriate to each INFO and FORMAT in the VCF header - * - * Must be called before any of the getter methods will work - * - * @param header a VCFHeader containing description for every INFO and FORMAT field we'll attempt to write out to BCF - * @param encoder the encoder we are going to use to write out the BCF2 data - * @param stringDictionary a map from VCFHeader strings to their offsets for encoding - */ - public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map stringDictionary) { - for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary); - add(siteWriters, field, writer); - } - - for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) { - final String field = line.getID(); - final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary); - add(genotypesWriters, field, writer); - } - } - - @Requires({"field != null", "writer != null"}) - @Ensures("map.containsKey(field)") - private final void add(final Map map, final String field, final T writer) { - if ( map.containsKey(field) ) - throw new IllegalStateException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders"); - map.put(field, writer); - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate SiteWriter for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header, - final VCFInfoHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false)); - } - - private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line, - final BCF2Encoder encoder, - final Map dict, - final boolean createGenotypesEncoders ) { - - if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && line.getType() != VCFHeaderLineType.Integer ) - System.err.println("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line); - return new BCF2FieldEncoder.IntArray(line, dict); - } else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldEncoder.GenericInts(line, dict); - } else { - switch ( line.getType() ) { - case Character: - case String: - return new BCF2FieldEncoder.StringOrCharacter(line, dict); - case Flag: - return new BCF2FieldEncoder.Flag(line, dict); - case Float: - return new BCF2FieldEncoder.Float(line, dict); - case Integer: - if ( line.isFixedCount() && line.getCount() == 1 ) - return new BCF2FieldEncoder.AtomicInt(line, dict); - else - return new BCF2FieldEncoder.GenericInts(line, dict); - default: - throw new IllegalArgumentException("Unexpected type for field " + line.getID()); - } - } - } - - // ----------------------------------------------------------------- - // - // Master routine to look at the header, a specific line, and - // build an appropriate Genotypes for that header element - // - // ----------------------------------------------------------------- - - private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header, - final VCFFormatHeaderLine line, - final BCF2Encoder encoder, - final Map dict) { - final String field = line.getID(); - final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true); - - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - return new BCF2FieldWriter.GTWriter(header, fieldEncoder); - } else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { - return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder); - } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { - return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); - } else if ( line.getType() == VCFHeaderLineType.Integer ) { - return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder); - } else { - return new BCF2FieldWriter.StaticallyTypeGenotypesWriter(header, fieldEncoder); - } - } - - // ----------------------------------------------------------------- - // - // Accessors to get site / genotype writers - // - // ----------------------------------------------------------------- - - /** - * Get a site writer specialized to encode values for site info field - * @param field key found in the VCF header INFO records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) { - return getWriter(field, siteWriters); - } - - /** - * Get a genotypes writer specialized to encode values for genotypes field - * @param field key found in the VCF header FORMAT records - * @return non-null writer if one can be found, or null if none exists for field - */ - public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) { - return getWriter(field, genotypesWriters); - } - - @Requires({"map != null", "key != null"}) - public T getWriter(final String key, final Map map) { - return map.get(key); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Writer.java deleted file mode 100644 index c24ffec48..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/BCF2Writer.java +++ /dev/null @@ -1,425 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.bcf2.BCF2Type; -import org.broadinstitute.variant.bcf2.BCF2Utils; -import org.broadinstitute.variant.bcf2.BCFVersion; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFContigHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFUtils; - -import java.io.*; -import java.util.*; - -/** - * VariantContextWriter that emits BCF2 binary encoding - * - * Overall structure of this writer is complex for efficiency reasons - * - * -- The BCF2Writer manages the low-level BCF2 encoder, the mappings - * from contigs and strings to offsets, the VCF header, and holds the - * lower-level encoders that map from VC and Genotype fields to their - * specific encoders. This class also writes out the standard BCF2 fields - * like POS, contig, the size of info and genotype data, QUAL, etc. It - * has loops over the INFO and GENOTYPES to encode each individual datum - * with the generic field encoders, but the actual encoding work is - * done with by the FieldWriters classes themselves - * - * -- BCF2FieldWriter are specialized classes for writing out SITE and - * genotype information for specific SITE/GENOTYPE fields (like AC for - * sites and GQ for genotypes). These are objects in themselves because - * the manage all of the complexity of relating the types in the VCF header - * with the proper encoding in BCF as well as the type representing this - * in java. Relating all three of these pieces of information together - * is the main complexity challenge in the encoder. The piece of code - * that determines which FieldWriters to associate with each SITE and - * GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters - * are specialized for specific combinations of encoders (see below) - * and contexts (genotypes) for efficiency, so they smartly manage - * the writing of PLs (encoded as int[]) directly into the lowest - * level BCFEncoder. - * - * -- At the third level is the BCF2FieldEncoder, relatively simple - * pieces of code that handle the task of determining the right - * BCF2 type for specific field values, as well as reporting back - * information such as the number of elements used to encode it - * (simple for atomic values like Integer but complex for PLs - * or lists of strings) - * - * -- At the lowest level is the BCF2Encoder itself. This provides - * just the limited encoding methods specified by the BCF2 specification. This encoder - * doesn't do anything but make it possible to conveniently write out valid low-level - * BCF2 constructs. - * - * @author Mark DePristo - * @since 06/12 - */ -class BCF2Writer extends IndexingVariantContextWriter { - public static final int MAJOR_VERSION = 2; - public static final int MINOR_VERSION = 1; - - final private static boolean ALLOW_MISSING_CONTIG_LINES = false; - - private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support - private VCFHeader header; - private final Map contigDictionary = new HashMap(); - private final Map stringDictionaryMap = new LinkedHashMap(); - private final boolean doNotWriteGenotypes; - private String[] sampleNames = null; - - private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives - final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); - - /** - * cached results for whether we can write out raw genotypes data. - */ - private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null; - private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false; - - - public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { - super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); - this.outputStream = getOutputStream(); - this.doNotWriteGenotypes = doNotWriteGenotypes; - } - - // -------------------------------------------------------------------------------- - // - // Interface functions - // - // -------------------------------------------------------------------------------- - - @Override - public void writeHeader(VCFHeader header) { - // make sure the header is sorted correctly - header = new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples()); - - // create the config offsets map - if ( header.getContigLines().isEmpty() ) { - if ( ALLOW_MISSING_CONTIG_LINES ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary"); - } - createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null)); - } else { - throw new IllegalStateException("Cannot write BCF2 file with missing contig lines"); - } - } else { - createContigDictionary(header.getContigLines()); - } - - // set up the map from dictionary string values -> offset - final ArrayList dict = BCF2Utils.makeDictionary(header); - for ( int i = 0; i < dict.size(); i++ ) { - stringDictionaryMap.put(dict.get(i), i); - } - - sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]); - - // setup the field encodings - fieldManager.setup(header, encoder, stringDictionaryMap); - - try { - // write out the header into a byte stream, get it's length, and write everything to the file - final ByteArrayOutputStream capture = new ByteArrayOutputStream(); - final OutputStreamWriter writer = new OutputStreamWriter(capture); - this.header = VCFWriter.writeHeader(header, writer, doNotWriteGenotypes, VCFWriter.getVersionLine(), "BCF2 stream"); - writer.append('\0'); // the header is null terminated by a byte - writer.close(); - - final byte[] headerBytes = capture.toByteArray(); - new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream); - BCF2Type.INT32.write(headerBytes.length, outputStream); - outputStream.write(headerBytes); - } catch (IOException e) { - throw new RuntimeException("BCF2 stream: Got IOException while trying to write BCF2 header", e); - } - } - - @Override - public void add( VariantContext vc ) { - if ( doNotWriteGenotypes ) - vc = new VariantContextBuilder(vc).noGenotypes().make(); - vc = vc.fullyDecode(header, false); - - super.add(vc); // allow on the fly indexing - - try { - final byte[] infoBlock = buildSitesData(vc); - final byte[] genotypesBlock = buildSamplesData(vc); - - // write the two blocks to disk - writeBlock(infoBlock, genotypesBlock); - } - catch ( IOException e ) { - throw new RuntimeException("Error writing record to BCF2 file: " + vc.toString(), e); - } - } - - @Override - public void close() { - try { - outputStream.flush(); - outputStream.close(); - } - catch ( IOException e ) { - throw new RuntimeException("Failed to close BCF2 file"); - } - super.close(); - } - - // -------------------------------------------------------------------------------- - // - // implicit block - // - // The first four records of BCF are inline untype encoded data of: - // - // 4 byte integer chrom offset - // 4 byte integer start - // 4 byte integer ref length - // 4 byte float qual - // - // -------------------------------------------------------------------------------- - private byte[] buildSitesData( VariantContext vc ) throws IOException { - final int contigIndex = contigDictionary.get(vc.getChr()); - if ( contigIndex == -1 ) - throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getChr())); - - // note use of encodeRawValue to not insert the typing byte - encoder.encodeRawValue(contigIndex, BCF2Type.INT32); - - // pos. GATK is 1 based, BCF2 is 0 based - encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32); - - // ref length. GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1 - // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1 - encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); - - // qual - if ( vc.hasLog10PError() ) - encoder.encodeRawFloat((float) vc.getPhredScaledQual()); - else - encoder.encodeRawMissingValue(BCF2Type.FLOAT); - - // info fields - final int nAlleles = vc.getNAlleles(); - final int nInfo = vc.getAttributes().size(); - final int nGenotypeFormatFields = getNGenotypeFormatFields(vc); - final int nSamples = header.getNGenotypeSamples(); - - encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x0000FFFF), BCF2Type.INT32); - encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x00FFFFF), BCF2Type.INT32); - - buildID(vc); - buildAlleles(vc); - buildFilter(vc); - buildInfo(vc); - - return encoder.getRecordBytes(); - } - - - /** - * Can we safely write on the raw (undecoded) genotypes of an input VC? - * - * The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in - * which case we return the previous result. If it's not cached, we use the BCF2Util to - * compare the VC header with our header (expensive) and cache it. - * - * @param lazyData - * @return - */ - private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) { - if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) { - // result is already cached - canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header); - lastVCFHeaderOfUnparsedGenotypes = lazyData.header; - } - - return canPassOnUnparsedGenotypeDataForLastVCFHeader; - } - - private BCF2Codec.LazyData getLazyData(final VariantContext vc) { - if ( vc.getGenotypes().isLazyWithData() ) { - final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); - - if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && - canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { - return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData(); - } else { - lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long - } - } - - return null; - } - - /** - * Try to get the nGenotypeFields as efficiently as possible. - * - * If this is a lazy BCF2 object just grab the field count from there, - * otherwise do the whole counting by types test in the actual data - * - * @param vc - * @return - */ - private final int getNGenotypeFormatFields(final VariantContext vc) { - final BCF2Codec.LazyData lazyData = getLazyData(vc); - return lazyData != null ? lazyData.nGenotypeFields : VCFWriter.calcVCFGenotypeKeys(vc, header).size(); - } - - private void buildID( VariantContext vc ) throws IOException { - encoder.encodeTypedString(vc.getID()); - } - - private void buildAlleles( VariantContext vc ) throws IOException { - for ( Allele allele : vc.getAlleles() ) { - final byte[] s = allele.getDisplayBases(); - if ( s == null ) - throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele); - encoder.encodeTypedString(s); - } - } - - private void buildFilter( VariantContext vc ) throws IOException { - if ( vc.isFiltered() ) { - encodeStringsByRef(vc.getFilters()); - } else if ( vc.filtersWereApplied() ) { - encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4)); - } else { - encoder.encodeTypedMissing(BCF2Type.INT8); - } - } - - private void buildInfo( VariantContext vc ) throws IOException { - for ( Map.Entry infoFieldEntry : vc.getAttributes().entrySet() ) { - final String field = infoFieldEntry.getKey(); - final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO"); - writer.start(encoder, vc); - writer.site(encoder, vc); - writer.done(encoder, vc); - } - } - - private byte[] buildSamplesData(final VariantContext vc) throws IOException { - final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects - if ( lazyData != null ) { - // we never decoded any data from this BCF file, so just pass it back - return lazyData.bytes; - } - - // we have to do work to convert the VC into a BCF2 byte stream - final List genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header); - for ( final String field : genotypeFields ) { - final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); - if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); - - assert writer != null; - - writer.start(encoder, vc); - for ( final String name : sampleNames ) { - Genotype g = vc.getGenotype(name); - if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); - writer.addGenotype(encoder, vc, g); - } - writer.done(encoder, vc); - } - return encoder.getRecordBytes(); - } - - /** - * Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file - * but there's no header line for it. - * - * @param vc - * @param field - * @param fieldType - */ - private final void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) { - throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " + - vc.getChr() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader"); - } - - // -------------------------------------------------------------------------------- - // - // Low-level block encoding - // - // -------------------------------------------------------------------------------- - - /** - * Write the data in the encoder to the outputstream as a length encoded - * block of data. After this call the encoder stream will be ready to - * start a new data block - * - * @throws IOException - */ - @Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"}) - private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException { - BCF2Type.INT32.write(infoBlock.length, outputStream); - BCF2Type.INT32.write(genotypesBlock.length, outputStream); - outputStream.write(infoBlock); - outputStream.write(genotypesBlock); - } - - @Requires("! strings.isEmpty()") - @Ensures("result.isIntegerType()") - private final BCF2Type encodeStringsByRef(final Collection strings) throws IOException { - final List offsets = new ArrayList(strings.size()); - - // iterate over strings until we find one that needs 16 bits, and break - for ( final String string : strings ) { - final Integer got = stringDictionaryMap.get(string); - if ( got == null ) throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF"); - final int offset = got; - offsets.add(offset); - } - - final BCF2Type type = BCF2Utils.determineIntegerType(offsets); - encoder.encodeTyped(offsets, type); - return type; - } - - /** - * Create the contigDictionary from the contigLines extracted from the VCF header - * - * @param contigLines - */ - @Requires("contigDictionary.isEmpty()") - private final void createContigDictionary(final Collection contigLines) { - int offset = 0; - for ( VCFContigHeaderLine contig : contigLines ) - contigDictionary.put(contig.getID(), offset++); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/IndexingVariantContextWriter.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/IndexingVariantContextWriter.java deleted file mode 100644 index 96a4fb411..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/IndexingVariantContextWriter.java +++ /dev/null @@ -1,181 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.broad.tribble.Tribble; -import org.broad.tribble.index.DynamicIndexCreator; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.io.*; - -/** - * this class writes VCF files - */ -abstract class IndexingVariantContextWriter implements VariantContextWriter { - private final String name; - private final SAMSequenceDictionary refDict; - - private OutputStream outputStream; - private PositionalOutputStream positionalOutputStream = null; - private DynamicIndexCreator indexer = null; - private LittleEndianOutputStream idxStream = null; - - @Requires({"name != null", - "! ( location == null && output == null )", - "! ( enableOnTheFlyIndexing && location == null )"}) - protected IndexingVariantContextWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { - outputStream = output; - this.name = name; - this.refDict = refDict; - - if ( enableOnTheFlyIndexing ) { - try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); - //System.out.println("Creating index on the fly for " + location); - indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - indexer.initialize(location, indexer.defaultBinSize()); - positionalOutputStream = new PositionalOutputStream(output); - outputStream = positionalOutputStream; - } catch ( IOException ex ) { - // No matter what we keep going, since we don't care if we can't create the index file - idxStream = null; - indexer = null; - positionalOutputStream = null; - } - } - } - - @Ensures("result != null") - public OutputStream getOutputStream() { - return outputStream; - } - - @Ensures("result != null") - public String getStreamName() { - return name; - } - - public abstract void writeHeader(VCFHeader header); - - /** - * attempt to close the VCF file - */ - public void close() { - try { - // try to close the index stream (keep it separate to help debugging efforts) - if ( indexer != null ) { - Index index = indexer.finalizeIndex(positionalOutputStream.getPosition()); - setIndexSequenceDictionary(index, refDict); - index.write(idxStream); - idxStream.close(); - } - - // close the underlying output stream as well - outputStream.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close index for " + getStreamName(), e); - } - } - - /** - * @return the reference sequence dictionary used for the variant contexts being written - */ - public SAMSequenceDictionary getRefDict() { - return refDict; - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - */ - public void add(VariantContext vc) { - // if we are doing on the fly indexing, add the record ***before*** we write any bytes - if ( indexer != null ) - indexer.addFeature(vc, positionalOutputStream.getPosition()); - } - - /** - * Returns a reasonable "name" for this writer, to display to the user if something goes wrong - * - * @param location - * @param stream - * @return - */ - protected static final String writerName(final File location, final OutputStream stream) { - return location == null ? stream.toString() : location.getAbsolutePath(); - } - - // a constant we use for marking sequence dictionary entries in the Tribble index property list - private static final String SequenceDictionaryPropertyPredicate = "DICT:"; - - private static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - index.addProperty(contig,length); - } - } -} - -final class PositionalOutputStream extends OutputStream { - private final OutputStream out; - private long position = 0; - - public PositionalOutputStream(final OutputStream out) { - this.out = out; - } - - public final void write(final byte[] bytes) throws IOException { - write(bytes, 0, bytes.length); - } - - public final void write(final byte[] bytes, final int startIndex, final int numBytes) throws IOException { - position += numBytes; - out.write(bytes, startIndex, numBytes); - } - - public final void write(int c) throws IOException { - position++; - out.write(c); - } - - public final long getPosition() { return position; } - - @Override - public void close() throws IOException { - super.close(); - out.close(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/IntGenotypeFieldAccessors.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/IntGenotypeFieldAccessors.java deleted file mode 100644 index f02612b43..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/IntGenotypeFieldAccessors.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.variantcontext.Genotype; - -import java.util.HashMap; - -/** - * A convenient way to provide a single view on the many int and int[] field values we work with, - * for writing out the values. This class makes writing out the inline AD, GQ, PL, DP fields - * easy and fast - * - * @author Mark DePristo - * @since 6/12 - */ -class IntGenotypeFieldAccessors { - // initialized once per writer to allow parallel writers to work - private final HashMap intGenotypeFieldEncoders = new HashMap(); - - public IntGenotypeFieldAccessors() { - intGenotypeFieldEncoders.put(VCFConstants.DEPTH_KEY, new IntGenotypeFieldAccessors.DPAccessor()); - intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new IntGenotypeFieldAccessors.ADAccessor()); - intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_PL_KEY, new IntGenotypeFieldAccessors.PLAccessor()); - intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_QUALITY_KEY, new IntGenotypeFieldAccessors.GQAccessor()); - } - - /** - * Return an accessor for field, or null if none exists - * @param field - * @return - */ - public Accessor getAccessor(final String field) { - return intGenotypeFieldEncoders.get(field); - } - - public static abstract class Accessor { - public abstract int[] getValues(final Genotype g); - - public final int getSize(final Genotype g) { - final int[] v = getValues(g); - return v == null ? 0 : v.length; - } - } - - private static abstract class AtomicAccessor extends Accessor { - private final int[] singleton = new int[1]; - - @Override - public int[] getValues(final Genotype g) { - singleton[0] = getValue(g); - return singleton[0] == -1 ? null : singleton; - } - - public abstract int getValue(final Genotype g); - } - - public static class GQAccessor extends AtomicAccessor { - @Override public int getValue(final Genotype g) { return Math.min(g.getGQ(), VCFConstants.MAX_GENOTYPE_QUAL); } - } - - public static class DPAccessor extends AtomicAccessor { - @Override public int getValue(final Genotype g) { return g.getDP(); } - } - - public static class ADAccessor extends Accessor { - @Override public int[] getValues(final Genotype g) { return g.getAD(); } - } - - public static class PLAccessor extends Accessor { - @Override public int[] getValues(final Genotype g) { return g.getPL(); } - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/Options.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/Options.java deleted file mode 100644 index 3b6d46451..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/Options.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -/** - * Available writer options for VariantContextWriters - * - * @author Mark DePristo - * @since 5/12 - */ -public enum Options { - INDEX_ON_THE_FLY, - DO_NOT_WRITE_GENOTYPES, - ALLOW_MISSING_FIELDS_IN_HEADER, - FORCE_BCF -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriter.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriter.java deleted file mode 100644 index d7254fa71..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriter.java +++ /dev/null @@ -1,61 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import org.broadinstitute.variant.variantcontext.VariantContext; - -/** - * this class writes VCF files, allowing records to be passed in unsorted (up to a certain genomic distance away) - */ -class SortingVariantContextWriter extends SortingVariantContextWriterBase { - - // the maximum START distance between records that we'll cache - private int maxCachingStartDistance; - - /** - * create a local-sorting VCF writer, given an inner VCF writer to write to - * - * @param innerWriter the VCFWriter to write to - * @param maxCachingStartDistance the maximum start distance between records that we'll cache - * @param takeOwnershipOfInner Should this Writer close innerWriter when it's done with it - */ - public SortingVariantContextWriter(VariantContextWriter innerWriter, int maxCachingStartDistance, boolean takeOwnershipOfInner) { - super(innerWriter, takeOwnershipOfInner); - this.maxCachingStartDistance = maxCachingStartDistance; - } - - public SortingVariantContextWriter(VariantContextWriter innerWriter, int maxCachingStartDistance) { - this(innerWriter, maxCachingStartDistance, false); // by default, don't own inner - } - - protected void noteCurrentRecord(VariantContext vc) { - super.noteCurrentRecord(vc); // first, check for errors - - // then, update mostUpstreamWritableLoc: - int mostUpstreamWritableIndex = vc.getStart() - maxCachingStartDistance; - this.mostUpstreamWritableLoc = Math.max(BEFORE_MOST_UPSTREAM_LOC, mostUpstreamWritableIndex); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriterBase.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriterBase.java deleted file mode 100644 index c4588dff6..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/SortingVariantContextWriterBase.java +++ /dev/null @@ -1,195 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.*; -import java.util.concurrent.PriorityBlockingQueue; - -/** - * This class writes VCF files, allowing records to be passed in unsorted. - * It also enforces that it is never passed records of the same chromosome with any other chromosome in between them. - */ -abstract class SortingVariantContextWriterBase implements VariantContextWriter { - - // The VCFWriter to which to actually write the sorted VCF records - private final VariantContextWriter innerWriter; - - // the current queue of un-emitted records - private final Queue queue; - - // The locus until which we are permitted to write out (inclusive) - protected Integer mostUpstreamWritableLoc; - protected static final int BEFORE_MOST_UPSTREAM_LOC = 0; // No real locus index is <= 0 - - // The set of chromosomes already passed over and to which it is forbidden to return - private final Set finishedChromosomes; - - // Should we call innerWriter.close() in close() - private final boolean takeOwnershipOfInner; - - // -------------------------------------------------------------------------------- - // - // Constructors - // - // -------------------------------------------------------------------------------- - - /** - * create a local-sorting VCF writer, given an inner VCF writer to write to - * - * @param innerWriter the VCFWriter to write to - * @param takeOwnershipOfInner Should this Writer close innerWriter when it's done with it - */ - public SortingVariantContextWriterBase(VariantContextWriter innerWriter, boolean takeOwnershipOfInner) { - this.innerWriter = innerWriter; - this.finishedChromosomes = new TreeSet(); - this.takeOwnershipOfInner = takeOwnershipOfInner; - - // has to be PriorityBlockingQueue to be thread-safe - this.queue = new PriorityBlockingQueue(50, new VariantContextComparator()); - - this.mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; - } - - public SortingVariantContextWriterBase(VariantContextWriter innerWriter) { - this(innerWriter, false); // by default, don't own inner - } - - // -------------------------------------------------------------------------------- - // - // public interface functions - // - // -------------------------------------------------------------------------------- - - @Override - public void writeHeader(VCFHeader header) { - innerWriter.writeHeader(header); - } - - /** - * attempt to close the VCF file; we need to flush the queue first - */ - @Override - public void close() { - stopWaitingToSort(); - - if (takeOwnershipOfInner) - innerWriter.close(); - } - - - /** - * add a record to the file - * - * @param vc the Variant Context object - */ - @Override - public synchronized void add(VariantContext vc) { - /* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100) - since there is no implicit ordering of chromosomes: - */ - VCFRecord firstRec = queue.peek(); - if (firstRec != null && !vc.getChr().equals(firstRec.vc.getChr())) { // if we hit a new contig, flush the queue - if (finishedChromosomes.contains(vc.getChr())) - throw new IllegalArgumentException("Added a record at " + vc.getChr() + ":" + vc.getStart() + ", but already finished with chromosome" + vc.getChr()); - - finishedChromosomes.add(firstRec.vc.getChr()); - stopWaitingToSort(); - } - - noteCurrentRecord(vc); // possibly overwritten - - queue.add(new VCFRecord(vc)); - emitSafeRecords(); - } - - /** - * Gets a string representation of this object. - * @return a string representation of this object - */ - @Override - public String toString() { - return getClass().getName(); - } - - // -------------------------------------------------------------------------------- - // - // protected interface functions for subclasses to use - // - // -------------------------------------------------------------------------------- - - private synchronized void stopWaitingToSort() { - emitRecords(true); - mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; - } - - protected synchronized void emitSafeRecords() { - emitRecords(false); - } - - protected void noteCurrentRecord(VariantContext vc) { - // did the user break the contract by giving a record too late? - if (mostUpstreamWritableLoc != null && vc.getStart() < mostUpstreamWritableLoc) // went too far back, since may have already written anything that is <= mostUpstreamWritableLoc - throw new IllegalArgumentException("Permitted to write any record upstream of position " + mostUpstreamWritableLoc + ", but a record at " + vc.getChr() + ":" + vc.getStart() + " was just added."); - } - - // -------------------------------------------------------------------------------- - // - // private implementation functions - // - // -------------------------------------------------------------------------------- - - private synchronized void emitRecords(boolean emitUnsafe) { - while (!queue.isEmpty()) { - VCFRecord firstRec = queue.peek(); - - // No need to wait, waiting for nothing, or before what we're waiting for: - if (emitUnsafe || mostUpstreamWritableLoc == null || firstRec.vc.getStart() <= mostUpstreamWritableLoc) { - queue.poll(); - innerWriter.add(firstRec.vc); - } - else { - break; - } - } - } - - private static class VariantContextComparator implements Comparator { - public int compare(VCFRecord r1, VCFRecord r2) { - return r1.vc.getStart() - r2.vc.getStart(); - } - } - - private static class VCFRecord { - public VariantContext vc; - - public VCFRecord(VariantContext vc) { - this.vc = vc; - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/VCFWriter.java deleted file mode 100644 index e794e9249..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VCFWriter.java +++ /dev/null @@ -1,606 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.*; -import java.lang.reflect.Array; -import java.nio.charset.Charset; -import java.util.*; - -/** - * this class writes VCF files - */ -class VCFWriter extends IndexingVariantContextWriter { - private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString(); - - // should we write genotypes or just sites? - final protected boolean doNotWriteGenotypes; - - // the VCF header we're storing - protected VCFHeader mHeader = null; - - final private boolean allowMissingFieldsInHeader; - - /** - * The VCF writer uses an internal Writer, based by the ByteArrayOutputStream lineBuffer, - * to temp. buffer the header and per-site output before flushing the per line output - * in one go to the super.getOutputStream. This results in high-performance, proper encoding, - * and allows us to avoid flushing explicitly the output stream getOutputStream, which - * allows us to properly compress vcfs in gz format without breaking indexing on the fly - * for uncompressed streams. - */ - private static final int INITIAL_BUFFER_SIZE = 1024 * 16; - private final ByteArrayOutputStream lineBuffer = new ByteArrayOutputStream(INITIAL_BUFFER_SIZE); - private final Writer writer; - - /** - * The encoding used for VCF files. ISO-8859-1 - */ - final private Charset charset; - - private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors(); - - public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, - final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes, - final boolean allowMissingFieldsInHeader ) { - super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); - this.doNotWriteGenotypes = doNotWriteGenotypes; - this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; - this.charset = Charset.forName("ISO-8859-1"); - this.writer = new OutputStreamWriter(lineBuffer, charset); - } - - // -------------------------------------------------------------------------------- - // - // VCFWriter interface functions - // - // -------------------------------------------------------------------------------- - - /** - * Write String s to the internal buffered writer. - * - * flushBuffer() must be called to actually write the data to the true output stream. - * - * @param s the string to write - * @throws IOException - */ - private void write(final String s) throws IOException { - writer.write(s); - } - - /** - * Actually write the line buffer contents to the destination output stream. - * - * After calling this function the line buffer is reset, so the contents of the buffer can be reused - * - * @throws IOException - */ - private void flushBuffer() throws IOException { - writer.flush(); - getOutputStream().write(lineBuffer.toByteArray()); - lineBuffer.reset(); - } - - @Override - public void writeHeader(VCFHeader header) { - // note we need to update the mHeader object after this call because they header - // may have genotypes trimmed out of it, if doNotWriteGenotypes is true - try { - mHeader = writeHeader(header, writer, doNotWriteGenotypes, getVersionLine(), getStreamName()); - flushBuffer(); - } catch ( IOException e ) { - throw new RuntimeException("Couldn't write file " + getStreamName(), e); - } - } - - public static String getVersionLine() { - return VERSION_LINE; - } - - public static VCFHeader writeHeader(VCFHeader header, - final Writer writer, - final boolean doNotWriteGenotypes, - final String versionLine, - final String streamNameForError) { - header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header; - - try { - // the file format field needs to be written first - writer.write(versionLine + "\n"); - - for ( VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { - if ( VCFHeaderVersion.isFormatString(line.getKey()) ) - continue; - - writer.write(VCFHeader.METADATA_INDICATOR); - writer.write(line.toString()); - writer.write("\n"); - } - - // write out the column line - writer.write(VCFHeader.HEADER_INDICATOR); - boolean isFirst = true; - for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { - if ( isFirst ) - isFirst = false; // don't write out a field separator - else - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(field.toString()); - } - - if ( header.hasGenotypingData() ) { - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write("FORMAT"); - for ( String sample : header.getGenotypeSamples() ) { - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(sample); - } - } - - writer.write("\n"); - writer.flush(); // necessary so that writing to an output stream will work - } - catch (IOException e) { - throw new RuntimeException("IOException writing the VCF header to " + streamNameForError, e); - } - - return header; - } - - /** - * attempt to close the VCF file - */ - @Override - public void close() { - // try to close the vcf stream - try { - // TODO -- would it be useful to null out the line buffer so we don't have it around unnecessarily? - writer.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close " + getStreamName(), e); - } - - super.close(); - } - - /** - * add a record to the file - * - * @param vc the Variant Context object - */ - @Override - public void add(VariantContext vc) { - if ( mHeader == null ) - throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName()); - - if ( doNotWriteGenotypes ) - vc = new VariantContextBuilder(vc).noGenotypes().make(); - - try { - super.add(vc); - - Map alleleMap = buildAlleleMap(vc); - - // CHROM - write(vc.getChr()); - write(VCFConstants.FIELD_SEPARATOR); - - // POS - write(String.valueOf(vc.getStart())); - write(VCFConstants.FIELD_SEPARATOR); - - // ID - String ID = vc.getID(); - write(ID); - write(VCFConstants.FIELD_SEPARATOR); - - // REF - String refString = vc.getReference().getDisplayString(); - write(refString); - write(VCFConstants.FIELD_SEPARATOR); - - // ALT - if ( vc.isVariant() ) { - Allele altAllele = vc.getAlternateAllele(0); - String alt = altAllele.getDisplayString(); - write(alt); - - for (int i = 1; i < vc.getAlternateAlleles().size(); i++) { - altAllele = vc.getAlternateAllele(i); - alt = altAllele.getDisplayString(); - write(","); - write(alt); - } - } else { - write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD); - } - write(VCFConstants.FIELD_SEPARATOR); - - // QUAL - if ( !vc.hasLog10PError() ) - write(VCFConstants.MISSING_VALUE_v4); - else - write(formatQualValue(vc.getPhredScaledQual())); - write(VCFConstants.FIELD_SEPARATOR); - - // FILTER - String filters = getFilterString(vc); - write(filters); - write(VCFConstants.FIELD_SEPARATOR); - - // INFO - Map infoFields = new TreeMap(); - for ( Map.Entry field : vc.getAttributes().entrySet() ) { - String key = field.getKey(); - - if ( ! mHeader.hasInfoLine(key) ) - fieldIsMissingFromHeaderError(vc, key, "INFO"); - - String outputValue = formatVCFField(field.getValue()); - if ( outputValue != null ) - infoFields.put(key, outputValue); - } - writeInfoString(infoFields); - - // FORMAT - final GenotypesContext gc = vc.getGenotypes(); - if ( gc.isLazyWithData() && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() instanceof String ) { - write(VCFConstants.FIELD_SEPARATOR); - write(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString()); - } else { - List genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader); - if ( ! genotypeAttributeKeys.isEmpty() ) { - for ( final String format : genotypeAttributeKeys ) - if ( ! mHeader.hasFormatLine(format) ) - fieldIsMissingFromHeaderError(vc, format, "FORMAT"); - - final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); - - write(VCFConstants.FIELD_SEPARATOR); - write(genotypeFormatString); - - addGenotypeData(vc, alleleMap, genotypeAttributeKeys); - } - } - - write("\n"); - // note that we cannot call flush here if we want block gzipping to work properly - // calling flush results in all gzipped blocks for each variant - flushBuffer(); - } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e); - } - } - - private static Map buildAlleleMap(final VariantContext vc) { - final Map alleleMap = new HashMap(vc.getAlleles().size()+1); - alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup - - final List alleles = vc.getAlleles(); - for ( int i = 0; i < alleles.size(); i++ ) { - alleleMap.put(alleles.get(i), String.valueOf(i)); - } - - return alleleMap; - } - - // -------------------------------------------------------------------------------- - // - // implementation functions - // - // -------------------------------------------------------------------------------- - - private final String getFilterString(final VariantContext vc) { - if ( vc.isFiltered() ) { - for ( final String filter : vc.getFilters() ) - if ( ! mHeader.hasFilterLine(filter) ) - fieldIsMissingFromHeaderError(vc, filter, "FILTER"); - - return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())); - } - else if ( vc.filtersWereApplied() ) - return VCFConstants.PASSES_FILTERS_v4; - else - return VCFConstants.UNFILTERED; - } - - private static final String QUAL_FORMAT_STRING = "%.2f"; - private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00"; - - private String formatQualValue(double qual) { - String s = String.format(QUAL_FORMAT_STRING, qual); - if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) ) - s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length()); - return s; - } - - /** - * create the info string; assumes that no values are null - * - * @param infoFields a map of info fields - * @throws IOException for writer - */ - private void writeInfoString(Map infoFields) throws IOException { - if ( infoFields.isEmpty() ) { - write(VCFConstants.EMPTY_INFO_FIELD); - return; - } - - boolean isFirst = true; - for ( Map.Entry entry : infoFields.entrySet() ) { - if ( isFirst ) - isFirst = false; - else - write(VCFConstants.INFO_FIELD_SEPARATOR); - - String key = entry.getKey(); - write(key); - - if ( !entry.getValue().equals("") ) { - VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); - if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) { - write("="); - write(entry.getValue()); - } - } - } - } - - /** - * add the genotype data - * - * @param vc the variant context - * @param genotypeFormatKeys Genotype formatting string - * @param alleleMap alleles for this context - * @throws IOException for writer - */ - private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) - throws IOException { - final int ploidy = vc.getMaxPloidy(2); - - for ( String sample : mHeader.getGenotypeSamples() ) { - write(VCFConstants.FIELD_SEPARATOR); - - Genotype g = vc.getGenotype(sample); - if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy); - - final List attrs = new ArrayList(genotypeFormatKeys.size()); - for ( String field : genotypeFormatKeys ) { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { - if ( !g.isAvailable() ) { - throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record"); - } - - writeAllele(g.getAllele(0), alleleMap); - for (int i = 1; i < g.getPloidy(); i++) { - write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); - writeAllele(g.getAllele(i), alleleMap); - } - - continue; - } else { - String outputValue; - if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) { - outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4; - } else { - final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field); - if ( accessor != null ) { - final int[] intValues = accessor.getValues(g); - if ( intValues == null ) - outputValue = VCFConstants.MISSING_VALUE_v4; - else if ( intValues.length == 1 ) // fast path - outputValue = Integer.toString(intValues[0]); - else { - StringBuilder sb = new StringBuilder(); - sb.append(intValues[0]); - for ( int i = 1; i < intValues.length; i++) { - sb.append(","); - sb.append(intValues[i]); - } - outputValue = sb.toString(); - } - } else { - Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - - VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); - if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc); - if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { - // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. - // For example, if Number=2, the string has to be ".,." - StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); - for ( int i = 1; i < numInFormatField; i++ ) { - sb.append(","); - sb.append(VCFConstants.MISSING_VALUE_v4); - } - val = sb.toString(); - } - } - - // assume that if key is absent, then the given string encoding suffices - outputValue = formatVCFField(val); - } - } - - if ( outputValue != null ) - attrs.add(outputValue); - } - } - - // strip off trailing missing values - for (int i = attrs.size()-1; i >= 0; i--) { - if ( isMissingValue(attrs.get(i)) ) - attrs.remove(i); - else - break; - } - - for (int i = 0; i < attrs.size(); i++) { - if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) ) - write(VCFConstants.GENOTYPE_FIELD_SEPARATOR); - write(attrs.get(i)); - } - } - } - - private boolean isMissingValue(String s) { - // we need to deal with the case that it's a list of missing values - return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); - } - - private void writeAllele(Allele allele, Map alleleMap) throws IOException { - String encoding = alleleMap.get(allele); - if ( encoding == null ) - throw new TribbleException.InternalCodecException("Allele " + allele + " is not an allele in the variant context"); - write(encoding); - } - - /** - * Takes a double value and pretty prints it to a String for display - * - * Large doubles => gets %.2f style formatting - * Doubles < 1 / 10 but > 1/100 => get %.3f style formatting - * Double < 1/100 => %.3e formatting - * @param d - * @return - */ - public static final String formatVCFDouble(final double d) { - String format; - if ( d < 1 ) { - if ( d < 0.01 ) { - if ( Math.abs(d) >= 1e-20 ) - format = "%.3e"; - else { - // return a zero format - return "0.00"; - } - } else { - format = "%.3f"; - } - } else { - format = "%.2f"; - } - - return String.format(format, d); - } - - public static String formatVCFField(Object val) { - String result; - if ( val == null ) - result = VCFConstants.MISSING_VALUE_v4; - else if ( val instanceof Double ) - result = formatVCFDouble((Double) val); - else if ( val instanceof Boolean ) - result = (Boolean)val ? "" : null; // empty string for true, null for false - else if ( val instanceof List ) { - result = formatVCFField(((List)val).toArray()); - } else if ( val.getClass().isArray() ) { - final int length = Array.getLength(val); - if ( length == 0 ) - return formatVCFField(null); - final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); - for ( int i = 1; i < length; i++) { - sb.append(","); - sb.append(formatVCFField(Array.get(val, i))); - } - result = sb.toString(); - } else - result = val.toString(); - - return result; - } - - /** - * Determine which genotype fields are in use in the genotypes in VC - * @param vc - * @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first - */ - public static List calcVCFGenotypeKeys(final VariantContext vc, final VCFHeader header) { - Set keys = new HashSet(); - - boolean sawGoodGT = false; - boolean sawGoodQual = false; - boolean sawGenotypeFilter = false; - boolean sawDP = false; - boolean sawAD = false; - boolean sawPL = false; - for ( final Genotype g : vc.getGenotypes() ) { - keys.addAll(g.getExtendedAttributes().keySet()); - if ( g.isAvailable() ) sawGoodGT = true; - if ( g.hasGQ() ) sawGoodQual = true; - if ( g.hasDP() ) sawDP = true; - if ( g.hasAD() ) sawAD = true; - if ( g.hasPL() ) sawPL = true; - if (g.isFiltered()) sawGenotypeFilter = true; - } - - if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY); - if ( sawDP ) keys.add(VCFConstants.DEPTH_KEY); - if ( sawAD ) keys.add(VCFConstants.GENOTYPE_ALLELE_DEPTHS); - if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY); - if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - - List sortedList = ParsingUtils.sortList(new ArrayList(keys)); - - // make sure the GT is first - if ( sawGoodGT ) { - List newList = new ArrayList(sortedList.size()+1); - newList.add(VCFConstants.GENOTYPE_KEY); - newList.addAll(sortedList); - sortedList = newList; - } - - if ( sortedList.isEmpty() && header.hasGenotypingData() ) { - // this needs to be done in case all samples are no-calls - return Collections.singletonList(VCFConstants.GENOTYPE_KEY); - } else { - return sortedList; - } - } - - - private static int countOccurrences(char c, String s) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - count += s.charAt(i) == c ? 1 : 0; - } - return count; - } - - private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) { - if ( !allowMissingFieldsInHeader) - throw new IllegalStateException("Key " + id + " found in VariantContext field " + field - + " at " + vc.getChr() + ":" + vc.getStart() - + " but this key isn't defined in the VCFHeader. We require all VCFs to have" - + " complete VCF headers by default."); - } -} diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriter.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriter.java deleted file mode 100644 index 4ab6b2dd4..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriter.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.VariantContext; - -/** - * this class writes VCF files - */ -public interface VariantContextWriter { - - public void writeHeader(VCFHeader header); - - /** - * attempt to close the VCF file - */ - public void close(); - - public void add(VariantContext vc); -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java b/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java deleted file mode 100644 index 542c7e422..000000000 --- a/public/java/src/org/broadinstitute/variant/variantcontext/writer/VariantContextWriterFactory.java +++ /dev/null @@ -1,121 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import net.sf.samtools.SAMSequenceDictionary; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.OutputStream; -import java.util.EnumSet; - -/** - * Factory methods to create VariantContext writers - * - * @author depristo - * @since 5/12 - */ -public class VariantContextWriterFactory { - - public static final EnumSet DEFAULT_OPTIONS = EnumSet.of(Options.INDEX_ON_THE_FLY); - public static final EnumSet NO_OPTIONS = EnumSet.noneOf(Options.class); - - private VariantContextWriterFactory() {} - - public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict) { - return create(location, openOutputStream(location), refDict, DEFAULT_OPTIONS); - } - - public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict, final EnumSet options) { - return create(location, openOutputStream(location), refDict, options); - } - - public static VariantContextWriter create(final File location, - final OutputStream output, - final SAMSequenceDictionary refDict) { - return create(location, output, refDict, DEFAULT_OPTIONS); - } - - public static VariantContextWriter create(final OutputStream output, - final SAMSequenceDictionary refDict, - final EnumSet options) { - return create(null, output, refDict, options); - } - - public static VariantContextWriter create(final File location, - final OutputStream output, - final SAMSequenceDictionary refDict, - final EnumSet options) { - final boolean enableBCF = isBCFOutput(location, options); - - if ( enableBCF ) - return new BCF2Writer(location, output, refDict, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES)); - else { - return new VCFWriter(location, output, refDict, - options.contains(Options.INDEX_ON_THE_FLY), - options.contains(Options.DO_NOT_WRITE_GENOTYPES), - options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)); - } - } - - /** - * Should we output a BCF file based solely on the name of the file at location? - * - * @param location - * @return - */ - public static boolean isBCFOutput(final File location) { - return isBCFOutput(location, EnumSet.noneOf(Options.class)); - } - - public static boolean isBCFOutput(final File location, final EnumSet options) { - return options.contains(Options.FORCE_BCF) || (location != null && location.getName().contains(".bcf")); - } - - public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance) { - return sortOnTheFly(innerWriter, maxCachingStartDistance, false); - } - - public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance, boolean takeOwnershipOfInner) { - return new SortingVariantContextWriter(innerWriter, maxCachingStartDistance, takeOwnershipOfInner); - } - - /** - * Returns a output stream writing to location, or throws an exception if this fails - * @param location - * @return - */ - protected static OutputStream openOutputStream(final File location) { - try { - return new FileOutputStream(location); - } catch (FileNotFoundException e) { - throw new RuntimeException(location + ": Unable to create VCF writer", e); - } - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/variant/vcf/AbstractVCFCodec.java deleted file mode 100644 index a4ccd050a..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/AbstractVCFCodec.java +++ /dev/null @@ -1,724 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.Feature; -import org.broad.tribble.NameAwareCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import net.sf.samtools.util.BlockCompressedInputStream; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.variantcontext.*; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.util.*; -import java.util.zip.GZIPInputStream; - - -public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { - public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); - - protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column - - // we have to store the list of strings that make up the header until they're needed - protected VCFHeader header = null; - protected VCFHeaderVersion version = null; - - // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); - - // for ParsingUtils.split - protected String[] GTValueArray = new String[100]; - protected String[] genotypeKeyArray = new String[100]; - protected String[] infoFieldArray = new String[1000]; - protected String[] infoValueArray = new String[1000]; - - // for performance testing purposes - public static boolean validate = true; - - // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over - // todo: make this thread safe? - protected String[] parts = null; - protected String[] genotypeParts = null; - protected final String[] locParts = new String[6]; - - // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); - - // we store a name to give to each of the variant contexts we emit - protected String name = "Unknown"; - - protected int lineNo = 0; - - protected Map stringCache = new HashMap(); - - protected boolean warnedAboutNoEqualsForNonFlag = false; - - /** - * If true, then we'll magically fix up VCF headers on the fly when we read them in - */ - protected boolean doOnTheFlyModifications = true; - - protected AbstractVCFCodec() { - super(VariantContext.class); - } - - /** - * Creates a LazyParser for a LazyGenotypesContext to use to decode - * our genotypes only when necessary. We do this instead of eagarly - * decoding the genotypes just to turn around and reencode in the frequent - * case where we don't actually want to manipulate the genotypes - */ - class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { - final List alleles; - final String contig; - final int start; - - LazyVCFGenotypesParser(final List alleles, final String contig, final int start) { - this.alleles = alleles; - this.contig = contig; - this.start = start; - } - - @Override - public LazyGenotypesContext.LazyData parse(final Object data) { - //System.out.printf("Loading genotypes... %s:%d%n", contig, start); - return createGenotypeMap((String) data, alleles, contig, start); - } - } - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public abstract Object readHeader(LineReader reader); - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied - */ - protected abstract List parseFilters(String filterString); - - /** - * create a VCF header from a set of header record lines - * - * @param headerStrings a list of strings that represent all the ## and # entries - * @return a VCFHeader object - */ - protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion version ) { - this.version = version; - - Set metaData = new LinkedHashSet(); - Set sampleNames = new LinkedHashSet(); - int contigCounter = 0; - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; - } - - while ( arrayIndex < strings.length ) - sampleNames.add(strings[arrayIndex++]); - - if ( sawFormatTag && sampleNames.size() == 0 ) - throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); - - } else { - if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); - metaData.add(info); - } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { - final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); - metaData.add(filter); - } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { - final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); - metaData.add(format); - } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); - metaData.add(contig); - } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); - metaData.add(alt); - } else { - int equals = str.indexOf("="); - if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); - } - } - } - - this.header = new VCFHeader(metaData, sampleNames); - if ( doOnTheFlyModifications ) - this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header); - return this.header; - } - - /** - * the fast decode function - * @param line the line of text for the record - * @return a feature, (not guaranteed complete) that has the correct start and stop - */ - public Feature decodeLoc(String line) { - return decodeLine(line, false); - } - - /** - * decode the line into a feature (VariantContext) - * @param line the line - * @return a VariantContext - */ - public VariantContext decode(String line) { - return decodeLine(line, true); - } - - private final VariantContext decodeLine(final String line, final boolean includeGenotypes) { - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new TribbleException("VCF Header cannot be null when decoding a record"); - - if (parts == null) - parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; - - int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) - if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || - (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) - throw new TribbleException("Line " + lineNo + ": there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + - " tokens, and saw " + nParts + " )"); - - return parseVCFLine(parts, includeGenotypes); - } - - /** - * parse out the VCF line - * - * @param parts the parts split up - * @return a variant context object - */ - private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { - VariantContextBuilder builder = new VariantContextBuilder(); - builder.source(getName()); - - // increment the line count - // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when - // TODO -- the first record is far along the contig) and the line counter can get out of sync - lineNo++; - - // parse out the required fields - final String chr = getCachedString(parts[0]); - builder.chr(chr); - int pos = -1; - try { - pos = Integer.valueOf(parts[1]); - } catch (NumberFormatException e) { - generateException(parts[1] + " is not a valid start position in the VCF format"); - } - builder.start(pos); - - if ( parts[2].length() == 0 ) - generateException("The VCF specification requires a valid ID field"); - else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) - builder.noID(); - else - builder.id(parts[2]); - - final String ref = getCachedString(parts[3].toUpperCase()); - final String alts = getCachedString(parts[4].toUpperCase()); - builder.log10PError(parseQual(parts[5])); - - final List filters = parseFilters(getCachedString(parts[6])); - if ( filters != null ) builder.filters(new HashSet(filters)); - final Map attrs = parseInfo(parts[7]); - builder.attributes(attrs); - - if ( attrs.containsKey(VCFConstants.END_KEY) ) { - // update stop with the end key if provided - try { - builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); - } catch (Exception e) { - generateException("the END value in the INFO field is not valid"); - } - } else { - builder.stop(pos + ref.length() - 1); - } - - // get our alleles, filters, and setup an attribute map - final List alleles = parseAlleles(ref, alts, lineNo); - builder.alleles(alleles); - - // do we have genotyping data - if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) { - final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); - final int nGenotypes = header.getNGenotypeSamples(); - LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); - - // did we resort the sample names? If so, we need to load the genotype data - if ( !header.samplesWereAlreadySorted() ) - lazy.decode(); - - builder.genotypesNoValidation(lazy); - } - - VariantContext vc = null; - try { - vc = builder.make(); - } catch (Exception e) { - generateException(e.getMessage()); - } - - return vc; - } - - /** - * get the name of this codec - * @return our set name - */ - public String getName() { - return name; - } - - /** - * set the name of this codec - * @param name new name - */ - public void setName(String name) { - this.name = name; - } - - /** - * Return a cached copy of the supplied string. - * - * @param str string - * @return interned string - */ - protected String getCachedString(String str) { - String internedString = stringCache.get(str); - if ( internedString == null ) { - internedString = new String(str); - stringCache.put(internedString, internedString); - } - return internedString; - } - - /** - * parse out the info fields - * @param infoField the fields - * @return a mapping of keys to objects - */ - private Map parseInfo(String infoField) { - Map attributes = new HashMap(); - - if ( infoField.length() == 0 ) - generateException("The VCF specification requires a valid info field"); - - if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field"); - - int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false); - for (int i = 0; i < infoFieldSplitSize; i++) { - String key; - Object value; - - int eqI = infoFieldArray[i].indexOf("="); - if ( eqI != -1 ) { - key = infoFieldArray[i].substring(0, eqI); - String valueString = infoFieldArray[i].substring(eqI+1); - - // split on the INFO field separator - int infoValueSplitSize = ParsingUtils.split(valueString, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); - if ( infoValueSplitSize == 1 ) { - value = infoValueArray[0]; - final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); - if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) { - // deal with the case where a flag field has =0, such as DB=0, by skipping the add - continue; - } - } else { - ArrayList valueList = new ArrayList(infoValueSplitSize); - for ( int j = 0; j < infoValueSplitSize; j++ ) - valueList.add(infoValueArray[j]); - value = valueList; - } - } else { - key = infoFieldArray[i]; - final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); - if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! warnedAboutNoEqualsForNonFlag ) { - System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " - + headerLine.getType() + " but this construct is only value for FLAG type fields"); - warnedAboutNoEqualsForNonFlag = true; - } - - value = VCFConstants.MISSING_VALUE_v4; - } else { - value = true; - } - } - - // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING - if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; - - attributes.put(key, value); - } - } - - return attributes; - } - - /** - * create a an allele from an index and an array of alleles - * @param index the index - * @param alleles the alleles - * @return an Allele - */ - protected static Allele oneAllele(String index, List alleles) { - if ( index.equals(VCFConstants.EMPTY_ALLELE) ) - return Allele.NO_CALL; - final int i; - try { - i = Integer.valueOf(index); - } catch ( NumberFormatException e ) { - throw new TribbleException.InternalCodecException("The following invalid GT allele index was encountered in the file: " + index); - } - if ( i >= alleles.size() ) - throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); - return alleles.get(i); - } - - - /** - * parse genotype alleles from the genotype string - * @param GT GT string - * @param alleles list of possible alleles - * @param cache cache of alleles for GT - * @return the allele list for the GT string - */ - protected static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { - // cache results [since they are immutable] and return a single object for each genotype - List GTAlleles = cache.get(GT); - - if ( GTAlleles == null ) { - StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS); - GTAlleles = new ArrayList(st.countTokens()); - while ( st.hasMoreTokens() ) { - String genotype = st.nextToken(); - GTAlleles.add(oneAllele(genotype, alleles)); - } - cache.put(GT, GTAlleles); - } - - return GTAlleles; - } - - /** - * parse out the qual value - * @param qualString the quality string - * @return return a double - */ - protected static Double parseQual(String qualString) { - // if we're the VCF 4 missing char, return immediately - if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) - return VariantContext.NO_LOG10_PERROR; - - Double val = Double.valueOf(qualString); - - // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles - if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) - return VariantContext.NO_LOG10_PERROR; - - // scale and return the value - return val / -10.0; - } - - /** - * parse out the alleles - * @param ref the reference base - * @param alts a string of alternates to break into alleles - * @param lineNo the line number for this record - * @return a list of alleles, and a pair of the shortest and longest sequence - */ - protected static List parseAlleles(String ref, String alts, int lineNo) { - List alleles = new ArrayList(2); // we are almost always biallelic - // ref - checkAllele(ref, true, lineNo); - Allele refAllele = Allele.create(ref, true); - alleles.add(refAllele); - - if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split - parseSingleAltAllele(alleles, alts, lineNo); - else - for ( String alt : alts.split(",") ) - parseSingleAltAllele(alleles, alt, lineNo); - - return alleles; - } - - /** - * check to make sure the allele is an acceptable allele - * @param allele the allele to check - * @param isRef are we the reference allele? - * @param lineNo the line number for this record - */ - private static void checkAllele(String allele, boolean isRef, int lineNo) { - if ( allele == null || allele.length() == 0 ) - generateException("Empty alleles are not permitted in VCF records", lineNo); - - if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { - System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); - } - - if ( isSymbolicAllele(allele) ) { - if ( isRef ) { - generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); - } - } else { - // check for VCF3 insertions or deletions - if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) - generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + - " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); - - if (!Allele.acceptableAlleleBases(allele)) - generateException("Unparsable vcf record with allele " + allele, lineNo); - - if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) - generateException("The reference allele cannot be missing", lineNo); - } - } - - /** - * return true if this is a symbolic allele (e.g. ) or - * structural variation breakend (with [ or ]), otherwise false - * @param allele the allele to check - * @return true if the allele is a symbolic allele, otherwise false - */ - private static boolean isSymbolicAllele(String allele) { - return (allele != null && allele.length() > 2 && - ((allele.startsWith("<") && allele.endsWith(">")) || - (allele.contains("[") || allele.contains("]")))); - } - - /** - * parse a single allele, given the allele list - * @param alleles the alleles available - * @param alt the allele to parse - * @param lineNo the line number for this record - */ - private static void parseSingleAltAllele(List alleles, String alt, int lineNo) { - checkAllele(alt, false, lineNo); - - Allele allele = Allele.create(alt, false); - if ( ! allele.isNoCall() ) - alleles.add(allele); - } - - public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { - try { - return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || - isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) || - isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); - } catch ( FileNotFoundException e ) { - return false; - } catch ( IOException e ) { - return false; - } - } - - private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { - try { - byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; - int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length()); - boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes()); - return eq; -// String firstLine = new String(buff); -// return firstLine.startsWith(MAGIC_HEADER_LINE); - } catch ( IOException e ) { - return false; - } catch ( RuntimeException e ) { - return false; - } finally { - try { stream.close(); } catch ( IOException e ) {} - } - } - - - /** - * create a genotype map - * - * @param str the string - * @param alleles the list of alleles - * @return a mapping of sample name to genotype object - */ - public LazyGenotypesContext.LazyData createGenotypeMap(final String str, - final List alleles, - final String chr, - final int pos) { - if (genotypeParts == null) - genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; - - int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); - if ( nParts != genotypeParts.length ) - generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); - - ArrayList genotypes = new ArrayList(nParts); - - // get the format keys - int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - // cycle through the sample names - Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); - - // clear out our allele mapping - alleleMap.clear(); - - // cycle through the genotype strings - for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { - int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); - - final String sampleName = sampleNameIterator.next(); - final GenotypeBuilder gb = new GenotypeBuilder(sampleName); - - // check to see if the value list is longer than the key list, which is a problem - if (nGTKeys < GTValueSplitSize) - generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); - - int genotypeAlleleLocation = -1; - if (nGTKeys >= 1) { - gb.maxAttributes(nGTKeys - 1); - - for (int i = 0; i < nGTKeys; i++) { - final String gtKey = genotypeKeyArray[i]; - boolean missing = i >= GTValueSplitSize; - - // todo -- all of these on the fly parsing of the missing value should be static constants - if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { - genotypeAlleleLocation = i; - } else if ( missing ) { - // if its truly missing (there no provided value) skip adding it to the attributes - } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - final List filters = parseFilters(getCachedString(GTValueArray[i])); - if ( filters != null ) gb.filters(filters); - } else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) { - // don't add missing values to the map - } else { - if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { - if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) - gb.noGQ(); - else - gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i]))); - } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { - gb.AD(decodeInts(GTValueArray[i])); - } else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) { - gb.PL(decodeInts(GTValueArray[i])); - } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { - gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs()); - } else if (gtKey.equals(VCFConstants.DEPTH_KEY)) { - gb.DP(Integer.valueOf(GTValueArray[i])); - } else { - gb.attribute(gtKey, GTValueArray[i]); - } - } - } - } - - // check to make sure we found a genotype field if our version is less than 4.1 file - if ( version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1 ) - generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0"); - if ( genotypeAlleleLocation > 0 ) - generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); - - final List GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); - gb.alleles(GTalleles); - gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1); - - // add it to the list - try { - genotypes.add(gb.make()); - } catch (TribbleException e) { - throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); - } - } - - return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset()); - } - - - private final static String[] INT_DECODE_ARRAY = new String[10000]; - private final static int[] decodeInts(final String string) { - final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ','); - final int[] values = new int[nValues]; - for ( int i = 0; i < nValues; i++ ) - values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]); - return values; - } - - /** - * Forces all VCFCodecs to not perform any on the fly modifications to the VCF header - * of VCF records. Useful primarily for raw comparisons such as when comparing - * raw VCF records - */ - public final void disableOnTheFlyModifications() { - doOnTheFlyModifications = false; - } - - - protected void generateException(String message) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); - } - - protected static void generateException(String message, int lineNo) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/variant/vcf/VCF3Codec.java deleted file mode 100644 index 5e2cfb2b9..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCF3Codec.java +++ /dev/null @@ -1,138 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; - -import java.io.IOException; -import java.util.*; - - -/** - * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been - * depreciated in favor of VCF4 (See VCF codec for the latest information) - * - *

- * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) - *

- * - *

- * See also: @see VCF specification
- * See also: @see VCF spec. publication - *

- * - * @author Mark DePristo - * @since 2010 - */ -public class VCF3Codec extends AbstractVCFCodec { - public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; - - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public Object readHeader(LineReader reader) { - List headerStrings = new ArrayList(); - - String line; - VCFHeaderVersion version = null; - try { - boolean foundHeaderVersion = false; - while ((line = reader.readLine()) != null) { - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(line); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(line); - return super.parseHeaderFromLines(headerStrings, version); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - } catch (IOException e) { - throw new RuntimeException("IO Exception ", e); - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied - */ - protected List parseFilters(String filterString) { - - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - // empty set for passes filters - List fFields = new ArrayList(); - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return new ArrayList(fFields); - - if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status"); - - // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return new ArrayList(filterHash.get(filterString)); - - // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, fFields); - - return fFields; - } - - @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/variant/vcf/VCFCodec.java deleted file mode 100644 index adb8b0842..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFCodec.java +++ /dev/null @@ -1,159 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; - -import java.io.IOException; -import java.util.*; - -/** - * A feature codec for the VCF 4 specification - * - *

- * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a - * header line, and then data lines each containing information about a position in the genome. - *

- *

One of the main uses of next-generation sequencing is to discover variation amongst large populations - * of related samples. Recently the format for storing next-generation read alignments has been - * standardised by the SAM/BAM file format specification. This has significantly improved the - * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent - * types of sequence variation, including SNPs, indels and larger structural variants, together - * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for - * fast data retrieval of variants from a range of positions on the reference genome. - * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects - * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements - * various utilities for processing VCF files, including validation, merging and comparing, - * and also provides a general Perl and Python API. - * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

- * - *

- * See also: @see VCF specification
- * See also: @see VCF spec. publication - *

- * - *

File format example

- *
- *     ##fileformat=VCFv4.0
- *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
- *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
- *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
- * 
- * - * @author Mark DePristo - * @since 2010 - */ -public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. - public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; - - /** - * @param reader the line reader to take header lines from - * @return the number of header lines - */ - public Object readHeader(LineReader reader) { - List headerStrings = new ArrayList(); - - String line; - try { - boolean foundHeaderVersion = false; - while ((line = reader.readLine()) != null) { - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(line); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(line); - super.parseHeaderFromLines(headerStrings, version); - return this.header; - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - } catch (IOException e) { - throw new RuntimeException("IO Exception ", e); - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) - */ - protected List parseFilters(String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptyList(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if ( filterString.length() == 0 ) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? - if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // empty set for passes filters - List fFields = new LinkedList(); - // otherwise we have to parse and cache the value - if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, Collections.unmodifiableList(fFields)); - - return fFields; - } - - @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFCompoundHeaderLine.java deleted file mode 100644 index 3fc790f80..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFCompoundHeaderLine.java +++ /dev/null @@ -1,258 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; - -/** - * a base class for compound header lines, which include info lines and format lines (so far) - */ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); - - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } - } - - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; - - // access methods - public String getID() { return name; } - public String getDescription() { return description; } - public VCFHeaderLineType getType() { return type; } - public VCFHeaderLineCount getCountType() { return countType; } - public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } - public int getCount() { - if ( ! isFixedCount() ) - throw new TribbleException("Asking for header line count when type is not an integer"); - return count; - } - - /** - * Get the number of values expected for this header field, given the properties of VariantContext vc - * - * If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1 - * If the count is of type A, return vc.getNAlleles - 1 - * If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the - * max ploidy among all samples. Note that if the max ploidy of the VC is 0 (there's no GT information - * at all, then implicitly assume diploid samples when computing G values. - * If the count is UNBOUNDED return -1 - * - * @param vc - * @return - */ - public int getCount(final VariantContext vc) { - switch ( countType ) { - case INTEGER: return count; - case UNBOUNDED: return -1; - case A: return vc.getNAlleles() - 1; - case G: - final int ploidy = vc.getMaxPloidy(2); - return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy); - default: - throw new TribbleException("Unknown count type: " + countType); - } - } - - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. format, info, etc - private final SupportedHeaderLineType lineType; - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); - } - - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - validate(); - } - - /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * - */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) { - countType = VCFHeaderLineCount.A; - } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) { - countType = VCFHeaderLineCount.G; - } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) && - numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) && - numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) { - countType = VCFHeaderLineCount.UNBOUNDED; - } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.valueOf(numberStr); - - } - - if ( count < 0 && countType == VCFHeaderLineCount.INTEGER ) - throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name); - - try { - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - } catch (Exception e) { - throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); - } - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); - - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - - this.lineType = lineType; - - validate(); - } - - private void validate() { - if ( name == null || type == null || description == null || lineType == null ) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - super.getKey(), name, type, description, lineType )); - - if ( type == VCFHeaderLineType.Flag && count != 0 ) { - count = 0; - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code"); - } - } - } - - /** - * make a string representation of this header line - * @return a string representation - */ - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch ( countType ) { - case A: number = VCFConstants.PER_ALLELE_COUNT; break; - case G: number = VCFConstants.PER_GENOTYPE_COUNT; break; - case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break; - case INTEGER: - default: number = count; - } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); - } - - /** - * returns true if we're equal to another compounder header line - * @param o a compound header line - * @return true if equal - */ - public boolean equals(Object o) { - if ( !(o instanceof VCFCompoundHeaderLine) ) - return false; - VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o; - return equalsExcludingDescription(other) && - description.equals(other.description); - } - - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { - return count == other.count && - countType == other.countType && - type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); - } - - /** - * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/variant/vcf/VCFConstants.java deleted file mode 100644 index 41659d735..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFConstants.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import java.util.Locale; - -public final class VCFConstants { - public static final Locale VCF_LOCALE = Locale.US; - - // reserved INFO/FORMAT field keys - public static final String ANCESTRAL_ALLELE_KEY = "AA"; - public static final String ALLELE_COUNT_KEY = "AC"; - public static final String MLE_ALLELE_COUNT_KEY = "MLEAC"; - public static final String ALLELE_FREQUENCY_KEY = "AF"; - public static final String MLE_ALLELE_FREQUENCY_KEY = "MLEAF"; - public static final String MLE_PER_SAMPLE_ALLELE_COUNT_KEY = "MLPSAC"; - public static final String MLE_PER_SAMPLE_ALLELE_FRACTION_KEY = "MLPSAF"; - public static final String ALLELE_NUMBER_KEY = "AN"; - public static final String RMS_BASE_QUALITY_KEY = "BQ"; - public static final String CIGAR_KEY = "CIGAR"; - public static final String DBSNP_KEY = "DB"; - public static final String DEPTH_KEY = "DP"; - public static final String DOWNSAMPLED_KEY = "DS"; - public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; - public static final String END_KEY = "END"; - - public static final String GENOTYPE_FILTER_KEY = "FT"; - public static final String GENOTYPE_KEY = "GT"; - public static final String GENOTYPE_POSTERIORS_KEY = "GP"; - public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; - public static final String GENOTYPE_PL_KEY = "PL"; // phred-scaled genotype likelihoods - @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods - - public static final String HAPMAP2_KEY = "H2"; - public static final String HAPMAP3_KEY = "H3"; - public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; - public static final String RMS_MAPPING_QUALITY_KEY = "MQ"; - public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0"; - public static final String SAMPLE_NUMBER_KEY = "NS"; - public static final String PHASE_QUALITY_KEY = "PQ"; - public static final String PHASE_SET_KEY = "PS"; - public static final String OLD_DEPTH_KEY = "RD"; - public static final String STRAND_BIAS_KEY = "SB"; - public static final String SOMATIC_KEY = "SOMATIC"; - public static final String VALIDATED_KEY = "VALIDATED"; - public static final String THOUSAND_GENOMES_KEY = "1000G"; - - // separators - public static final String FORMAT_FIELD_SEPARATOR = ":"; - public static final String GENOTYPE_FIELD_SEPARATOR = ":"; - public static final char GENOTYPE_FIELD_SEPARATOR_CHAR = ':'; - public static final String FIELD_SEPARATOR = "\t"; - public static final char FIELD_SEPARATOR_CHAR = '\t'; - public static final String FILTER_CODE_SEPARATOR = ";"; - public static final String INFO_FIELD_ARRAY_SEPARATOR = ","; - public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ','; - public static final String ID_FIELD_SEPARATOR = ";"; - public static final String INFO_FIELD_SEPARATOR = ";"; - public static final char INFO_FIELD_SEPARATOR_CHAR = ';'; - public static final String UNPHASED = "/"; - public static final String PHASED = "|"; - public static final String PHASED_SWITCH_PROB_v3 = "\\"; - public static final String PHASING_TOKENS = "/|\\"; - - // header lines - public static final String FILTER_HEADER_START = "##FILTER"; - public static final String FORMAT_HEADER_START = "##FORMAT"; - public static final String INFO_HEADER_START = "##INFO"; - public static final String ALT_HEADER_START = "##ALT"; - public static final String CONTIG_HEADER_KEY = "contig"; - public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY; - - // old indel alleles - public static final char DELETION_ALLELE_v3 = 'D'; - public static final char INSERTION_ALLELE_v3 = 'I'; - - // missing/default values - public static final String UNFILTERED = "."; - public static final String PASSES_FILTERS_v3 = "0"; - public static final String PASSES_FILTERS_v4 = "PASS"; - public static final String EMPTY_ID_FIELD = "."; - public static final String EMPTY_INFO_FIELD = "."; - public static final String EMPTY_ALTERNATE_ALLELE_FIELD = "."; - public static final String MISSING_VALUE_v4 = "."; - public static final String MISSING_QUALITY_v3 = "-1"; - public static final Double MISSING_QUALITY_v3_DOUBLE = Double.valueOf(MISSING_QUALITY_v3); - - public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1"; - public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1"; - public static final String MISSING_DEPTH_v3 = "-1"; - public static final String UNBOUNDED_ENCODING_v4 = "."; - public static final String UNBOUNDED_ENCODING_v3 = "-1"; - public static final String PER_ALLELE_COUNT = "A"; - public static final String PER_GENOTYPE_COUNT = "G"; - public static final String EMPTY_ALLELE = "."; - public static final String EMPTY_GENOTYPE = "./."; - public static final int MAX_GENOTYPE_QUAL = 99; - - public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare - public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH"; -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFContigHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFContigHeaderLine.java deleted file mode 100644 index 5e6a73baf..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFContigHeaderLine.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import java.util.Map; - -/** - * A special class representing a contig VCF header line. Nows the true contig order and sorts on that - * - * @author mdepristo - */ -public class VCFContigHeaderLine extends VCFSimpleHeaderLine { - final Integer contigIndex; - - - /** - * create a VCF contig header line - * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - */ - public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, int contigIndex) { - super(line, version, key, null); - this.contigIndex = contigIndex; - } - - public VCFContigHeaderLine(final Map mapping, int contigIndex) { - super(VCFHeader.CONTIG_KEY, mapping, null); - this.contigIndex = contigIndex; - } - - public Integer getContigIndex() { - return contigIndex; - } - - /** - * IT IS CRITIAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER - * - * @param other - * @return - */ - @Override - public int compareTo(final Object other) { - if ( other instanceof VCFContigHeaderLine ) - return contigIndex.compareTo(((VCFContigHeaderLine) other).contigIndex); - else { - return super.compareTo(other); - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFFilterHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFFilterHeaderLine.java deleted file mode 100644 index c853033c0..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFFilterHeaderLine.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import java.util.Arrays; - -/** - * @author ebanks - * A class representing a key=value entry for FILTER fields in the VCF header - */ -public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - - /** - * create a VCF filter header line - * - * @param name the name for this header line - * @param description the description for this header line - */ - public VCFFilterHeaderLine(String name, String description) { - super("FILTER", name, description); - } - - /** - * Convenience constructor for FILTER whose description is the name - * @param name - */ - public VCFFilterHeaderLine(String name) { - super("FILTER", name, name); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - */ - public VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description")); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFFormatHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFFormatHeaderLine.java deleted file mode 100644 index 0e88e0220..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFFormatHeaderLine.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - - -/** - * @author ebanks - *

- * Class VCFFormatHeaderLine - *

- * A class representing a key=value entry for genotype FORMAT fields in the VCF header - */ -public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { - - public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); - } - - public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - } - - public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); - } - - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). - @Override - boolean allowFlagValues() { - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java deleted file mode 100644 index 9d4c4d576..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java +++ /dev/null @@ -1,454 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.utils.GeneralUtils; - -import java.util.*; - - -/** - * This class is really a POS. It allows duplicate entries in the metadata, - * stores header lines in lots of places, and all around f*cking sucks. - * - * todo -- clean this POS up - * - * @author aaron - *

- * Class VCFHeader - *

- * A class representing the VCF header - */ -public class VCFHeader { - - // the mandatory header fields - public enum HEADER_FIELDS { - CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO - } - - // the associated meta data - private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new LinkedHashMap(); - private final Map mFormatMetaData = new LinkedHashMap(); - private final Map mFilterMetaData = new LinkedHashMap(); - private final Map mOtherMetaData = new LinkedHashMap(); - private final List contigMetaData = new ArrayList(); - - // the list of auxillary tags - private final List mGenotypeSampleNames = new ArrayList(); - - // the character string that indicates meta data - public static final String METADATA_INDICATOR = "##"; - - // the header string indicator - public static final String HEADER_INDICATOR = "#"; - - public static final String SOURCE_KEY = "source"; - public static final String REFERENCE_KEY = "reference"; - public static final String CONTIG_KEY = "contig"; - public static final String INTERVALS_KEY = "intervals"; - public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals"; - public static final String INTERVAL_MERGING_KEY = "interval_merging"; - public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule"; - public static final String INTERVAL_PADDING_KEY = "interval_padding"; - - // were the input samples sorted originally (or are we sorting them)? - private boolean samplesWereAlreadySorted = true; - - // cache for efficient conversion of VCF -> VariantContext - private ArrayList sampleNamesInOrder = null; - private HashMap sampleNameToOffset = null; - - private boolean writeEngineHeaders = true; - private boolean writeCommandLine = true; - - /** - * Create an empty VCF header with no header lines and no samples - */ - public VCFHeader() { - this(Collections.emptySet(), Collections.emptySet()); - } - - /** - * create a VCF header, given a list of meta data and auxillary tags - * - * @param metaData the meta data associated with this header - */ - public VCFHeader(Set metaData) { - mMetaData.addAll(metaData); - loadVCFVersion(); - loadMetaDataMaps(); - } - - /** - * Creates a shallow copy of the meta data in VCF header toCopy - * - * @param toCopy - */ - public VCFHeader(final VCFHeader toCopy) { - this(toCopy.mMetaData); - } - - /** - * create a VCF header, given a list of meta data and auxillary tags - * - * @param metaData the meta data associated with this header - * @param genotypeSampleNames the sample names - */ - public VCFHeader(Set metaData, Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); - } - - public VCFHeader(Set metaData, List genotypeSampleNames) { - this(metaData); - - if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) - throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names"); - - mGenotypeSampleNames.addAll(genotypeSampleNames); - samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); - buildVCFReaderMaps(genotypeSampleNames); - } - - /** - * Tell this VCF header to use pre-calculated sample name ordering and the - * sample name -> offset map. This assumes that all VariantContext created - * using this header (i.e., read by the VCFCodec) will have genotypes - * occurring in the same order - * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearence - */ - private void buildVCFReaderMaps(Collection genotypeSampleNamesInAppearenceOrder) { - sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); - sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); - - int i = 0; - for ( final String name : genotypeSampleNamesInAppearenceOrder ) { - sampleNamesInOrder.add(name); - sampleNameToOffset.put(name, i++); - } - Collections.sort(sampleNamesInOrder); - } - - - /** - * Adds a header line to the header metadata. - * - * @param headerLine Line to add to the existing metadata component. - */ - public void addMetaDataLine(VCFHeaderLine headerLine) { - mMetaData.add(headerLine); - loadMetaDataMaps(); - } - - /** - * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present - */ - public List getContigLines() { - return Collections.unmodifiableList(contigMetaData); - } - - - /** - * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present - */ - public List getFilterLines() { - final List filters = new ArrayList(); - for ( VCFHeaderLine line : mMetaData ) { - if ( line instanceof VCFFilterHeaderLine ) { - filters.add((VCFFilterHeaderLine)line); - } - } - return filters; - } - - /** - * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present - */ - public List getIDHeaderLines() { - final List filters = new ArrayList(); - for ( VCFHeaderLine line : mMetaData ) { - if ( line instanceof VCFIDHeaderLine ) { - filters.add((VCFIDHeaderLine)line); - } - } - return filters; - } - - /** - * check our metadata for a VCF version tag, and throw an exception if the version is out of date - * or the version is not present - */ - public void loadVCFVersion() { - List toRemove = new ArrayList(); - for ( VCFHeaderLine line : mMetaData ) - if ( VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - // remove old header lines for now, - mMetaData.removeAll(toRemove); - - } - - /** - * load the format/info meta data maps (these are used for quick lookup by key name) - */ - private void loadMetaDataMaps() { - for ( VCFHeaderLine line : mMetaData ) { - if ( line instanceof VCFInfoHeaderLine ) { - VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - addMetaDataMapBinding(mInfoMetaData, infoLine); - } else if ( line instanceof VCFFormatHeaderLine ) { - VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - addMetaDataMapBinding(mFormatMetaData, formatLine); - } else if ( line instanceof VCFFilterHeaderLine ) { - VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; - mFilterMetaData.put(filterLine.getID(), filterLine); - } else if ( line instanceof VCFContigHeaderLine ) { - contigMetaData.add((VCFContigHeaderLine)line); - } else { - mOtherMetaData.put(line.getKey(), line); - } - } - - if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " - + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" - + " automatically adding a corresponding PL field to your VCF header"); - } - addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); - } - } - - /** - * Add line to map, issuing warnings about duplicates - * - * @param map - * @param line - * @param - */ - private final void addMetaDataMapBinding(final Map map, T line) { - final String key = line.getID(); - if ( map.containsKey(key) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" ); - } - } - else { - map.put(key, line); - } - } - - /** - * get the header fields in order they're presented in the input file (which is now required to be - * the order presented in the spec). - * - * @return a set of the header fields, in order - */ - public Set getHeaderFields() { - return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); - } - - /** - * get the meta data, associated with this header, in sorted order - * - * @return a set of the meta data - */ - public Set getMetaDataInInputOrder() { - return makeGetMetaDataSet(mMetaData); - } - - public Set getMetaDataInSortedOrder() { - return makeGetMetaDataSet(new TreeSet(mMetaData)); - } - - private static Set makeGetMetaDataSet(final Set headerLinesInSomeOrder) { - final Set lines = new LinkedHashSet(); - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString())); - lines.addAll(headerLinesInSomeOrder); - return Collections.unmodifiableSet(lines); - } - - /** - * Get the VCFHeaderLine whose key equals key. Returns null if no such line exists - * @param key - * @return - */ - public VCFHeaderLine getMetaDataLine(final String key) { - for (final VCFHeaderLine line: mMetaData) { - if ( line.getKey().equals(key) ) - return line; - } - - return null; - } - - /** - * get the genotyping sample names - * - * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false - */ - public List getGenotypeSamples() { - return mGenotypeSampleNames; - } - - public int getNGenotypeSamples() { - return mGenotypeSampleNames.size(); - } - - /** - * do we have genotyping data? - * - * @return true if we have genotyping columns, false otherwise - */ - public boolean hasGenotypingData() { - return getNGenotypeSamples() > 0; - } - - /** - * were the input samples sorted originally? - * - * @return true if the input samples were sorted originally, false otherwise - */ - public boolean samplesWereAlreadySorted() { - return samplesWereAlreadySorted; - } - - /** @return the column count */ - public int getColumnCount() { - return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); - } - - /** - * Returns the INFO HeaderLines in their original ordering - */ - public Collection getInfoHeaderLines() { - return mInfoMetaData.values(); - } - - /** - * Returns the FORMAT HeaderLines in their original ordering - */ - public Collection getFormatHeaderLines() { - return mFormatMetaData.values(); - } - - /** - * @param id the header key name - * @return the meta data line, or null if there is none - */ - public VCFInfoHeaderLine getInfoHeaderLine(String id) { - return mInfoMetaData.get(id); - } - - /** - * @param id the header key name - * @return the meta data line, or null if there is none - */ - public VCFFormatHeaderLine getFormatHeaderLine(String id) { - return mFormatMetaData.get(id); - } - - /** - * @param id the header key name - * @return the meta data line, or null if there is none - */ - public VCFFilterHeaderLine getFilterHeaderLine(final String id) { - return mFilterMetaData.get(id); - } - - public boolean hasInfoLine(final String id) { - return getInfoHeaderLine(id) != null; - } - - public boolean hasFormatLine(final String id) { - return getFormatHeaderLine(id) != null; - } - - public boolean hasFilterLine(final String id) { - return getFilterHeaderLine(id) != null; - } - - /** - * @param key the header key name - * @return the meta data line, or null if there is none - */ - public VCFHeaderLine getOtherHeaderLine(String key) { - return mOtherMetaData.get(key); - } - - /** - * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. - * @return true if additional engine headers will be written to the VCF - */ - public boolean isWriteEngineHeaders() { - return writeEngineHeaders; - } - - /** - * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. - * @param writeEngineHeaders true if additional engine headers will be written to the VCF - */ - public void setWriteEngineHeaders(boolean writeEngineHeaders) { - this.writeEngineHeaders = writeEngineHeaders; - } - - /** - * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. - * @return true if the command line will be written to the VCF - */ - public boolean isWriteCommandLine() { - return writeCommandLine; - } - - /** - * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. - * @param writeCommandLine true if the command line will be written to the VCF - */ - public void setWriteCommandLine(boolean writeCommandLine) { - this.writeCommandLine = writeCommandLine; - } - - public ArrayList getSampleNamesInOrder() { - return sampleNamesInOrder; - } - - public HashMap getSampleNameToOffset() { - return sampleNameToOffset; - } - - @Override - public String toString() { - final StringBuilder b = new StringBuilder(); - b.append("[VCFHeader:"); - for ( final VCFHeaderLine line : mMetaData ) - b.append("\n\t").append(line); - return b.append("\n]").toString(); - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLine.java deleted file mode 100644 index d18e310f5..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLine.java +++ /dev/null @@ -1,134 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; - -import java.util.Map; - - -/** - * @author ebanks - *

- * Class VCFHeaderLine - *

- * A class representing a key=value entry in the VCF header - */ -public class VCFHeaderLine implements Comparable { - protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; - - - /** - * create a VCF header line - * - * @param key the key for this header line - * @param value the value for this header line - */ - public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null"); - mKey = key; - mValue = value; - } - - /** - * Get the key - * - * @return the key - */ - public String getKey() { - return mKey; - } - - /** - * Get the value - * - * @return the value - */ - public String getValue() { - return mValue; - } - - public String toString() { - return toStringEncoding(); - } - - /** - * Should be overloaded in sub classes to do subclass specific - * - * @return the string encoding - */ - protected String toStringEncoding() { - return mKey + "=" + mValue; - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFHeaderLine) ) - return false; - return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue()); - } - - public int compareTo(Object other) { - return toString().compareTo(other.toString()); - } - - /** - * @param line the line - * @return true if the line is a VCF meta data line, or false if it is not - */ - public static boolean isHeaderLine(String line) { - return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1)); - } - - /** - * create a string of a mapping pair for the target VCF version - * @param keyValues a mapping of the key->value pairs to output - * @return a string, correctly formatted - */ - public static String toStringEncoding(Map keyValues) { - StringBuilder builder = new StringBuilder(); - builder.append("<"); - boolean start = true; - for (Map.Entry entry : keyValues.entrySet()) { - if (start) start = false; - else builder.append(","); - - if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues); - - builder.append(entry.getKey()); - builder.append("="); - builder.append(entry.getValue().toString().contains(",") || - entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue()); - } - builder.append(">"); - return builder.toString(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineCount.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineCount.java deleted file mode 100644 index bae404b6c..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineCount.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -/** - * the count encodings we use for fields in VCF header lines - */ -public enum VCFHeaderLineCount { - INTEGER, A, G, UNBOUNDED; -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineTranslator.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineTranslator.java deleted file mode 100644 index 3c2a35d46..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineTranslator.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; - -import java.util.*; - -/** - * A class for translating between vcf header versions - */ -public class VCFHeaderLineTranslator { - private static Map mapping; - - static { - mapping = new HashMap(); - mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser()); - mapping.put(VCFHeaderVersion.VCF4_1,new VCF4Parser()); - mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser()); - mapping.put(VCFHeaderVersion.VCF3_2,new VCF3Parser()); - } - - public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return mapping.get(version).parseLine(valueLine,expectedTagOrder); - } -} - - -interface VCFLineParser { - public Map parseLine(String valueLine, List expectedTagOrder); -} - - -/** - * a class that handles the to and from disk for VCF 4 lines - */ -class VCF4Parser implements VCFLineParser { - /** - * parse a VCF4 line - * @param valueLine the line - * @return a mapping of the tags parsed out - */ - public Map parseLine(String valueLine, List expectedTagOrder) { - // our return map - Map ret = new LinkedHashMap(); - - // a builder to store up characters as we go - StringBuilder builder = new StringBuilder(); - - // store the key when we're parsing out the values - String key = ""; - - // where are we in the stream of characters? - int index = 0; - - // are we inside a quotation? we don't special case ',' then - boolean inQuote = false; - - // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly [yes, but this machine is getting ugly now... MAD] - for (char c: valueLine.toCharArray()) { - if ( c == '\"' ) { - inQuote = ! inQuote; - } else if ( inQuote ) { - builder.append(c); - } else { - switch (c) { - case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it - case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list - case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder - case (',') : ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; // drop the current key value to the return map - default: builder.append(c); // otherwise simply append to the current string - } - } - - index++; - } - - // validate the tags against the expected list - index = 0; - if ( expectedTagOrder != null ) { - if ( ret.size() > expectedTagOrder.size() ) - throw new TribbleException.InvalidHeader("unexpected tag count " + ret.size() + " in line " + valueLine); - for ( String str : ret.keySet() ) { - if ( !expectedTagOrder.get(index).equals(str) ) - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); - index++; - } - } - return ret; - } -} - -class VCF3Parser implements VCFLineParser { - - public Map parseLine(String valueLine, List expectedTagOrder) { - // our return map - Map ret = new LinkedHashMap(); - - // a builder to store up characters as we go - StringBuilder builder = new StringBuilder(); - - // where are we in the stream of characters? - int index = 0; - // where in the expected tag order are we? - int tagIndex = 0; - - // are we inside a quotation? we don't special case ',' then - boolean inQuote = false; - - // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly - for (char c: valueLine.toCharArray()) { - switch (c) { - case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map - default: builder.append(c); // otherwise simply append to the current string - } - index++; - } - ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - - // validate the tags against the expected list - index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); - for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); - index++; - } - return ret; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineType.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineType.java deleted file mode 100644 index d2d502ab7..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderLineType.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -/** - * the type encodings we use for fields in VCF header lines - */ -public enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderVersion.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderVersion.java deleted file mode 100644 index 35ca45126..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeaderVersion.java +++ /dev/null @@ -1,116 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.TribbleException; - -/** - * information that identifies each header version - */ -public enum VCFHeaderVersion { - VCF3_2("VCRv3.2","format"), - VCF3_3("VCFv3.3","fileformat"), - VCF4_0("VCFv4.0","fileformat"), - VCF4_1("VCFv4.1","fileformat"); - - private final String versionString; - private final String formatString; - - /** - * create the enum, privately, using: - * @param vString the version string - * @param fString the format string - */ - VCFHeaderVersion(String vString, String fString) { - this.versionString = vString; - this.formatString = fString; - } - - /** - * get the header version - * @param version the version string - * @return a VCFHeaderVersion object - */ - public static VCFHeaderVersion toHeaderVersion(String version) { - version = clean(version); - for (VCFHeaderVersion hv : VCFHeaderVersion.values()) - if (hv.versionString.equals(version)) - return hv; - return null; - } - - /** - * are we a valid version string of some type - * @param version the version string - * @return true if we're valid of some type, false otherwise - */ - public static boolean isVersionString(String version){ - return toHeaderVersion(version) != null; - } - - /** - * are we a valid format string for some type - * @param format the format string - * @return true if we're valid of some type, false otherwise - */ - public static boolean isFormatString(String format){ - format = clean(format); - for (VCFHeaderVersion hv : VCFHeaderVersion.values()) - if (hv.formatString.equals(format)) - return true; - return false; - } - - public static VCFHeaderVersion getHeaderVersion(String versionLine) { - String[] lineFields = versionLine.split("="); - if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) - throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); - - if ( !isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - - return toHeaderVersion(lineFields[1]); - } - - /** - * Utility function to clean up a VCF header string - * - * @param s string - * @return trimmed version of s - */ - private static String clean(String s) { - return s.trim(); - } - - - public String getVersionString() { - return versionString; - } - - public String getFormatString() { - return formatString; - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFIDHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFIDHeaderLine.java deleted file mode 100644 index cdd544076..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFIDHeaderLine.java +++ /dev/null @@ -1,31 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -/** an interface for ID-based header lines **/ -public interface VCFIDHeaderLine { - String getID(); -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFInfoHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFInfoHeaderLine.java deleted file mode 100644 index 8ecf52278..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFInfoHeaderLine.java +++ /dev/null @@ -1,54 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - - -/** - * @author ebanks - *

- * Class VCFInfoHeaderLine - *

- * A class representing a key=value entry for INFO fields in the VCF header - */ -public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } - - public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } - - public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); - } - - // info fields allow flag values - @Override - boolean allowFlagValues() { - return true; - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFSimpleHeaderLine.java b/public/java/src/org/broadinstitute/variant/vcf/VCFSimpleHeaderLine.java deleted file mode 100644 index 20a973921..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFSimpleHeaderLine.java +++ /dev/null @@ -1,106 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - - -/** - * @author ebanks - * A class representing a key=value entry for simple VCF header types - */ -public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - private String name; - private Map genericFields = new LinkedHashMap(); - - /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param description description for this header line - */ - public VCFSimpleHeaderLine(String key, String name, String description) { - super(key, ""); - Map map = new LinkedHashMap(1); - map.put("Description", description); - initialize(name, map); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line - */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering), expectedTagOrdering); - } - - public VCFSimpleHeaderLine(final String key, final Map mapping, final List expectedTagOrdering) { - super(key, ""); - name = mapping.get("ID"); - initialize(name, mapping); - } - - protected void initialize(String name, Map genericFields) { - if ( name == null || genericFields == null || genericFields.isEmpty() ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); - - this.name = name; - this.genericFields.putAll(genericFields); - } - - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.putAll(genericFields); - return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFSimpleHeaderLine) ) - return false; - VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; - if ( !name.equals(other.name) || genericFields.size() != other.genericFields.size() ) - return false; - for ( Map.Entry entry : genericFields.entrySet() ) { - if ( !entry.getValue().equals(other.genericFields.get(entry.getKey())) ) - return false; - } - - return true; - } - - public String getID() { - return name; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFStandardHeaderLines.java b/public/java/src/org/broadinstitute/variant/vcf/VCFStandardHeaderLines.java deleted file mode 100644 index d289c679e..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFStandardHeaderLines.java +++ /dev/null @@ -1,264 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.GeneralUtils; - -import java.util.*; - -/** - * Manages header lines for standard VCF INFO and FORMAT fields - * - * Provides simple mechanisms for registering standard lines, - * looking them up, and adding them to headers - * - * @author Mark DePristo - * @since 6/12 - */ -public class VCFStandardHeaderLines { - /** - * Enabling this causes us to repair header lines even if only their descriptions differ - */ - private final static boolean REPAIR_BAD_DESCRIPTIONS = false; - private static Standards formatStandards = new Standards(); - private static Standards infoStandards = new Standards(); - - /** - * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly - * allocated VCFHeader with standard VCF header lines repaired as necessary - * - * @param header - * @return - */ - @Requires("header != null") - @Ensures("result != null") - public static VCFHeader repairStandardHeaderLines(final VCFHeader header) { - final Set newLines = new LinkedHashSet(header.getMetaDataInInputOrder().size()); - for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { - if ( line instanceof VCFFormatHeaderLine ) { - line = formatStandards.repair((VCFFormatHeaderLine) line); - } else if ( line instanceof VCFInfoHeaderLine) { - line = infoStandards.repair((VCFInfoHeaderLine) line); - } - - newLines.add(line); - } - - return new VCFHeader(newLines, header.getGenotypeSamples()); - } - - /** - * Adds header lines for each of the format fields in IDs to header, returning the set of - * IDs without standard descriptions, unless throwErrorForMissing is true, in which - * case this situation results in a TribbleException - * - * @param IDs - * @return - */ - public static Set addStandardFormatLines(final Set headerLines, final boolean throwErrorForMissing, final Collection IDs) { - return formatStandards.addToHeader(headerLines, IDs, throwErrorForMissing); - } - - /** - * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection) - * - * @param headerLines - * @param throwErrorForMissing - * @param IDs - * @return - */ - public static Set addStandardFormatLines(final Set headerLines, final boolean throwErrorForMissing, final String ... IDs) { - return addStandardFormatLines(headerLines, throwErrorForMissing, Arrays.asList(IDs)); - } - - /** - * Returns the standard format line for ID. If none exists, return null or throw an exception, depending - * on throwErrorForMissing - * - * @param ID - * @param throwErrorForMissing - * @return - */ - public static VCFFormatHeaderLine getFormatLine(final String ID, final boolean throwErrorForMissing) { - return formatStandards.get(ID, throwErrorForMissing); - } - - /** - * Returns the standard format line for ID. If none exists throw an exception - * - * @param ID - * @return - */ - public static VCFFormatHeaderLine getFormatLine(final String ID) { - return formatStandards.get(ID, true); - } - - private static void registerStandard(final VCFFormatHeaderLine line) { - formatStandards.add(line); - } - - /** - * Adds header lines for each of the info fields in IDs to header, returning the set of - * IDs without standard descriptions, unless throwErrorForMissing is true, in which - * case this situation results in a TribbleException - * - * @param IDs - * @return - */ - public static Set addStandardInfoLines(final Set headerLines, final boolean throwErrorForMissing, final Collection IDs) { - return infoStandards.addToHeader(headerLines, IDs, throwErrorForMissing); - } - - /** - * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection) - * - * @param IDs - * @return - */ - public static Set addStandardInfoLines(final Set headerLines, final boolean throwErrorForMissing, final String ... IDs) { - return addStandardInfoLines(headerLines, throwErrorForMissing, Arrays.asList(IDs)); - } - - /** - * Returns the standard info line for ID. If none exists, return null or throw an exception, depending - * on throwErrorForMissing - * - * @param ID - * @param throwErrorForMissing - * @return - */ - public static VCFInfoHeaderLine getInfoLine(final String ID, final boolean throwErrorForMissing) { - return infoStandards.get(ID, throwErrorForMissing); - } - - /** - * Returns the standard info line for ID. If none exists throw an exception - * - * @param ID - * @return - */ - public static VCFInfoHeaderLine getInfoLine(final String ID) { - return getInfoLine(ID, true); - } - - private static void registerStandard(final VCFInfoHeaderLine line) { - infoStandards.add(line); - } - - - // - // VCF header line constants - // - static { - // FORMAT lines - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); - - // INFO lines - registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); - } - - private static class Standards { - private final Map standards = new HashMap(); - - @Requires("line != null") - @Ensures({"result != null", "result.getID().equals(line.getID())"}) - public T repair(final T line) { - final T standard = get(line.getID(), false); - if ( standard != null ) { - final boolean badCountType = line.getCountType() != standard.getCountType(); - final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount(); - final boolean badType = line.getType() != standard.getType(); - final boolean badDesc = ! line.getDescription().equals(standard.getDescription()); - final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); - - if ( needsRepair ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Repairing standard header line for field " + line.getID() + " because" - + (badCountType ? " -- count types disagree; header has " + line.getCountType() + " but standard is " + standard.getCountType() : "") - + (badType ? " -- type disagree; header has " + line.getType() + " but standard is " + standard.getType() : "") - + (badCount ? " -- counts disagree; header has " + line.getCount() + " but standard is " + standard.getCount() : "") - + (badDesc ? " -- descriptions disagree; header has '" + line.getDescription() + "' but standard is '" + standard.getDescription() + "'": "")); - } - return standard; - } else - return line; - } else - return line; - } - - @Requires("headerLines != null") - @Ensures({"result != null", "result.isEmpty() || ! throwErrorForMissing", "IDs.containsAll(result)"}) - public Set addToHeader(final Set headerLines, final Collection IDs, final boolean throwErrorForMissing) { - final Set missing = new HashSet(); - for ( final String ID : IDs ) { - final T line = get(ID, throwErrorForMissing); - if ( line == null ) - missing.add(ID); - else - headerLines.add(line); - } - - return missing; - } - - @Requires("line != null") - @Ensures({"standards.containsKey(line.getID())"}) - public void add(final T line) { - if ( standards.containsKey(line.getID()) ) - throw new TribbleException("Attempting to add multiple standard header lines for ID " + line.getID()); - standards.put(line.getID(), line); - } - - @Requires("ID != null") - @Ensures({"result != null || ! throwErrorForMissing"}) - public T get(final String ID, final boolean throwErrorForMissing) { - final T x = standards.get(ID); - if ( throwErrorForMissing && x == null ) - throw new TribbleException("Couldn't find a standard VCF header line for field " + ID); - return x; - } - } -} diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java deleted file mode 100644 index f61761652..000000000 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFUtils.java +++ /dev/null @@ -1,196 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.commons.io.FilenameUtils; -import org.broadinstitute.variant.utils.GeneralUtils; - -import java.io.File; -import java.util.*; - -public class VCFUtils { - - public static Set smartMergeHeaders(Collection headers, boolean emitWarnings) throws IllegalStateException { - HashMap map = new HashMap(); // from KEY.NAME -> line - HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); - - // todo -- needs to remove all version headers from sources and add its own VCF version line - for ( VCFHeader source : headers ) { - //System.out.printf("Merging in header %s%n", source); - for ( VCFHeaderLine line : source.getMetaDataInSortedOrder()) { - - String key = line.getKey(); - if ( line instanceof VCFIDHeaderLine ) - key = key + "-" + ((VCFIDHeaderLine)line).getID(); - - if ( map.containsKey(key) ) { - VCFHeaderLine other = map.get(key); - if ( line.equals(other) ) { - // continue; - } else if ( ! line.getClass().equals(other.getClass()) ) { - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFFilterHeaderLine ) { - String lineName = ((VCFFilterHeaderLine) line).getID(); - String otherName = ((VCFFilterHeaderLine) other).getID(); - if ( ! lineName.equals(otherName) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFCompoundHeaderLine ) { - VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; - VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; - - // if the names are the same, but the values are different, we need to quit - if (! (compLine).equalsExcludingDescription(compOther) ) { - if ( compLine.getType().equals(compOther.getType()) ) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. - conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); - } - } - if ( ! compLine.getDescription().equals(compOther.getDescription()) ) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - //System.out.printf("Adding header line %s%n", line); - } - } - } - - return new HashSet(map.values()); - } - - /** - * Add / replace the contig header lines in the VCFHeader with the in the reference file and master reference dictionary - * - * @param oldHeader the header to update - * @param referenceFile the file path to the reference sequence used to generate this vcf - * @param refDict the SAM formatted reference sequence dictionary - */ - public static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) { - return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaDataInInputOrder(), referenceFile, refDict), oldHeader.getGenotypeSamples()); - } - - public static Set withUpdatedContigsAsLines(final Set oldLines, final File referenceFile, final SAMSequenceDictionary refDict) { - return withUpdatedContigsAsLines(oldLines, referenceFile, refDict, false); - } - - public static Set withUpdatedContigsAsLines(final Set oldLines, final File referenceFile, final SAMSequenceDictionary refDict, boolean referenceNameOnly) { - final Set lines = new LinkedHashSet(oldLines.size()); - - for ( final VCFHeaderLine line : oldLines ) { - if ( line instanceof VCFContigHeaderLine ) - continue; // skip old contig lines - if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) ) - continue; // skip the old reference key - lines.add(line); - } - - for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) ) - lines.add(contigLine); - - String referenceValue; - if (referenceFile != null) { - if (referenceNameOnly) - referenceValue = FilenameUtils.getBaseName(referenceFile.getName()); - else - referenceValue = "file://" + referenceFile.getAbsolutePath(); - lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, referenceValue)); - } - return lines; - } - - /** - * Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null - * @param refDict reference dictionary - * @param referenceFile for assembly name. May be null - * @return list of vcf contig header lines - */ - public static List makeContigHeaderLines(final SAMSequenceDictionary refDict, - final File referenceFile) { - final List lines = new ArrayList(); - final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null; - for ( SAMSequenceRecord contig : refDict.getSequences() ) - lines.add(makeContigHeaderLine(contig, assembly)); - return lines; - } - - private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { - final Map map = new LinkedHashMap(3); - map.put("ID", contig.getSequenceName()); - map.put("length", String.valueOf(contig.getSequenceLength())); - if ( assembly != null ) map.put("assembly", assembly); - return new VCFContigHeaderLine(map, contig.getSequenceIndex()); - } - - private static String getReferenceAssembly(final String refPath) { - // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot - String assembly = null; - if (refPath.contains("b37") || refPath.contains("v37")) - assembly = "b37"; - else if (refPath.contains("b36")) - assembly = "b36"; - else if (refPath.contains("hg18")) - assembly = "hg18"; - else if (refPath.contains("hg19")) - assembly = "hg19"; - return assembly; - } - - /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ - private static final class HeaderConflictWarner { - boolean emitWarnings; - Set alreadyIssued = new HashSet(); - - private HeaderConflictWarner( final boolean emitWarnings ) { - this.emitWarnings = emitWarnings; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && ! alreadyIssued.contains(line.getKey()) ) { - alreadyIssued.add(line.getKey()); - System.err.println(msg); - } - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 70ece1140..d6cba26d6 100644 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -31,9 +31,18 @@ import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.crypt.CryptUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.io.IOUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.bcf2.BCF2Codec; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.testng.Assert; import org.testng.Reporter; import org.testng.SkipException; @@ -343,4 +352,154 @@ public abstract class BaseTest { + (message == null ? "" : "message: " + message)); } } + + public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { + Assert.assertNotNull(actual, "VariantContext expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); + + assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); + assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); + assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); + if ( expected.hasGenotypes() ) { + assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); + final Set samples = expected.getSampleNames(); + for ( final String sample : samples ) { + assertGenotypesAreEqual(actual.getGenotype(sample), expected.getGenotype(sample)); + } + } + } + + public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { + final Iterator actualIT = actual.iterator(); + final Iterator expectedIT = expected.iterator(); + + while ( expectedIT.hasNext() ) { + final VariantContext expectedVC = expectedIT.next(); + if ( expectedVC == null ) + continue; + + VariantContext actualVC; + do { + Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); + actualVC = actualIT.next(); + } while ( actualIT.hasNext() && actualVC == null ); + + if ( actualVC == null ) + Assert.fail("Too few records in actual"); + + assertVariantContextsAreEqual(actualVC, expectedVC); + } + Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); + } + + + public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { + Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); + + // filters are the same + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); + + // inline attributes + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); + Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); + + Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); + + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); + assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); + } + + public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { + Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); + + // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? + //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); + final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); + final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); + for ( int i = 0; i < actualLines.size(); i++ ) { + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); + } + } + + public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { + final Pair vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); + final Pair bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); + assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); + assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); + } + + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { + if ( expected instanceof Double ) { + // must be very tolerant because doubles are being rounded to 2 sig figs + assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); + } else + Assert.assertEquals(actual, expected, "Attribute " + key); + } + + private static void assertAttributesEquals(final Map actual, Map expected) { + final Set expectedKeys = new HashSet(expected.keySet()); + + for ( final Map.Entry act : actual.entrySet() ) { + final Object actualValue = act.getValue(); + if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { + final Object expectedValue = expected.get(act.getKey()); + if ( expectedValue instanceof List ) { + final List expectedList = (List)expectedValue; + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); + final List actualList = (List)actualValue; + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); + for ( int i = 0; i < expectedList.size(); i++ ) + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); + } else + assertAttributeEquals(act.getKey(), actualValue, expectedValue); + } else { + // it's ok to have a binding in x -> null that's absent in y + Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + } + expectedKeys.remove(act.getKey()); + } + + // now expectedKeys contains only the keys found in expected but not in actual, + // and they must all be null + for ( final String missingExpected : expectedKeys ) { + final Object value = expected.get(missingExpected); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); + } + } + + private static final boolean isMissing(final Object value) { + if ( value == null ) return true; + else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; + else if ( value instanceof List ) { + // handles the case where all elements are null or the list is empty + for ( final Object elt : (List)value) + if ( elt != null ) + return false; + return true; + } else + return false; + } } diff --git a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java b/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java index 765511ae6..8a8faee8b 100644 --- a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java @@ -49,7 +49,6 @@ import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.Assert; import org.testng.annotations.BeforeClass; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index eec0f653a..155d44ecd 100644 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; import org.testng.Assert; import org.testng.annotations.AfterSuite; import org.testng.annotations.BeforeMethod; @@ -82,7 +81,7 @@ public class WalkerTest extends BaseTest { if ( bcfFile != null && bcfFile.exists() ) { logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); try { - VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); + assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); logger.warn(" Shadow BCF PASSED!"); } catch ( Exception e ) { Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e); diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index cb2a6bfb2..787db9a0f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -35,10 +35,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -250,13 +252,13 @@ public class BandPassActivityProfileUnitTest extends BaseTest { final File file = new File(path); final VCFCodec codec = new VCFCodec(); - final VariantContextTestProvider.VariantContextContainer reader = VariantContextTestProvider.readAllVCs(file, codec); + final Pair reader = GATKVCFUtils.readAllVCs(file, codec); final List incRegions = new ArrayList(); final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser); final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser); int pos = start; - for ( final VariantContext vc : reader.getVCs() ) { + for ( final VariantContext vc : reader.getSecond() ) { if ( vc == null ) continue; while ( pos < vc.getStart() ) { final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 6ff052bdc..6eb9afc8c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -702,7 +702,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { for ( int i = 0; i < biallelics.size(); i++ ) { final VariantContext actual = biallelics.get(i); final VariantContext expected = expectedBiallelics.get(i); - VariantContextTestProvider.assertEquals(actual, expected); + assertVariantContextsAreEqual(actual, expected); } } diff --git a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java b/public/java/test/org/broadinstitute/variant/VariantBaseTest.java deleted file mode 100644 index 6cec4d40b..000000000 --- a/public/java/test/org/broadinstitute/variant/VariantBaseTest.java +++ /dev/null @@ -1,166 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant; - -import org.testng.Assert; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * Base class for test classes within org.broadinstitute.variant - */ -public class VariantBaseTest { - - public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; - - // TODO: change this to an appropriate value once the move to the Picard repo takes place - public static final String variantTestDataRoot = new File("private/testdata/").getAbsolutePath() + "/"; - - /** - * Simple generic utility class to creating TestNG data providers: - * - * 1: inherit this class, as in - * - * private class SummarizeDifferenceTest extends TestDataProvider { - * public SummarizeDifferenceTest() { - * super(SummarizeDifferenceTest.class); - * } - * ... - * } - * - * Provide a reference to your class to the TestDataProvider constructor. - * - * 2: Create instances of your subclass. Return from it the call to getTests, providing - * the class type of your test - * - * @DataProvider(name = "summaries" - * public Object[][] createSummaries() { - * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - * } - * - * This class magically tracks created objects of this - */ - public static class TestDataProvider { - private static final Map> tests = new HashMap>(); - protected String name; - - /** - * Create a new TestDataProvider instance bound to the class variable C - * @param c - */ - public TestDataProvider(Class c, String name) { - if ( ! tests.containsKey(c) ) - tests.put(c, new ArrayList()); - tests.get(c).add(this); - this.name = name; - } - - public TestDataProvider(Class c) { - this(c, ""); - } - - public void setName(final String name) { - this.name = name; - } - - /** - * Return all of the data providers in the form expected by TestNG of type class C - * @param c - * @return - */ - public static Object[][] getTests(Class c) { - List params2 = new ArrayList(); - for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Override - public String toString() { - return "TestDataProvider("+name+")"; - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. - */ - public static File createTempFile(String name, String extension) { - try { - File file = File.createTempFile(name, extension); - file.deleteOnExit(); - return file; - } catch (IOException ex) { - throw new RuntimeException("Cannot create temp file: " + ex.getMessage(), ex); - } - } - - private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected); - } - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); - } - - public static final void assertEqualsDoubleSmart(final double actual, final double expected) { - assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); - } - - public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { - final Set actualSet = new HashSet(actual); - final Set expectedSet = new HashSet(expected); - Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { - assertEqualsDoubleSmart(actual, expected, tolerance, null); - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { - if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not"); - else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); - else { - final double delta = Math.abs(actual - expected); - final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual - + " not within tolerance " + tolerance - + (message == null ? "" : "message: " + message)); - } - } -} diff --git a/public/java/test/org/broadinstitute/variant/bcf2/BCF2EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/variant/bcf2/BCF2EncoderDecoderUnitTest.java deleted file mode 100644 index 8f3a216b7..000000000 --- a/public/java/test/org/broadinstitute/variant/bcf2/BCF2EncoderDecoderUnitTest.java +++ /dev/null @@ -1,573 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -// the imports for unit testing. -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.variantcontext.writer.BCF2Encoder; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - - -public class BCF2EncoderDecoderUnitTest extends VariantBaseTest { - private final double FLOAT_TOLERANCE = 1e-6; - final List primitives = new ArrayList(); - final List basicTypes = new ArrayList(); - final List forCombinations = new ArrayList(); - - @BeforeSuite - public void before() { - basicTypes.add(new BCF2TypedValue(1, BCF2Type.INT8)); - basicTypes.add(new BCF2TypedValue(1000, BCF2Type.INT16)); - basicTypes.add(new BCF2TypedValue(1000000, BCF2Type.INT32)); - basicTypes.add(new BCF2TypedValue(1.2345e6, BCF2Type.FLOAT)); - basicTypes.add(new BCF2TypedValue("A", BCF2Type.CHAR)); - - // small ints - primitives.add(new BCF2TypedValue(0, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(10, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(-1, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(100, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(-100, BCF2Type.INT8)); - primitives.add(new BCF2TypedValue(-127, BCF2Type.INT8)); // last value in range - primitives.add(new BCF2TypedValue( 127, BCF2Type.INT8)); // last value in range - - // medium ints - primitives.add(new BCF2TypedValue(-1000, BCF2Type.INT16)); - primitives.add(new BCF2TypedValue(1000, BCF2Type.INT16)); - primitives.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range - primitives.add(new BCF2TypedValue(-32767, BCF2Type.INT16)); // last value in range - primitives.add(new BCF2TypedValue( 32767, BCF2Type.INT16)); // last value in range - - // larger ints - primitives.add(new BCF2TypedValue(-32768, BCF2Type.INT32)); // first value in range - primitives.add(new BCF2TypedValue( 32768, BCF2Type.INT32)); // first value in range - primitives.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); - primitives.add(new BCF2TypedValue(100000, BCF2Type.INT32)); - primitives.add(new BCF2TypedValue(-2147483647, BCF2Type.INT32)); - primitives.add(new BCF2TypedValue(2147483647, BCF2Type.INT32)); - - // floats - primitives.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-0.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.1, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.1, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(5.0 / 3.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-5.0 / 3.0, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.23e3, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.23e6, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.23e9, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.23e12, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(1.23e15, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.23e3, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.23e6, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.23e9, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.23e12, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(-1.23e15, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(Float.MIN_VALUE, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(Float.MAX_VALUE, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(Double.NEGATIVE_INFINITY, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(Double.POSITIVE_INFINITY, BCF2Type.FLOAT)); - primitives.add(new BCF2TypedValue(Double.NaN, BCF2Type.FLOAT)); - - // strings - //primitives.add(new BCF2TypedValue("", BCFType.CHAR)); <- will be null (which is right) - primitives.add(new BCF2TypedValue("S", BCF2Type.CHAR)); - primitives.add(new BCF2TypedValue("S2", BCF2Type.CHAR)); - primitives.add(new BCF2TypedValue("12345678910", BCF2Type.CHAR)); - primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); - primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); - - // missing values - for ( BCF2Type type : BCF2Type.values() ) { - primitives.add(new BCF2TypedValue(null, type)); - } - - forCombinations.add(new BCF2TypedValue(10, BCF2Type.INT8)); - forCombinations.add(new BCF2TypedValue(100, BCF2Type.INT8)); - forCombinations.add(new BCF2TypedValue(-100, BCF2Type.INT8)); - forCombinations.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range - forCombinations.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range - forCombinations.add(new BCF2TypedValue(-100000, BCF2Type.INT32)); - forCombinations.add(new BCF2TypedValue(100000, BCF2Type.INT32)); - forCombinations.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT)); - forCombinations.add(new BCF2TypedValue(1.23e6, BCF2Type.FLOAT)); - forCombinations.add(new BCF2TypedValue(-1.23e6, BCF2Type.FLOAT)); - forCombinations.add(new BCF2TypedValue("S", BCF2Type.CHAR)); - forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); - forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR)); - - // missing values - for ( BCF2Type type : BCF2Type.values() ) { - forCombinations.add(new BCF2TypedValue(null, type)); - } - } - - // -------------------------------------------------------------------------------- - // - // merge case Provider - // - // -------------------------------------------------------------------------------- - - private class BCF2TypedValue { - final BCF2Type type; - final Object value; - - private BCF2TypedValue(final int value, final BCF2Type type) { - this(new Integer(value), type); - } - - private BCF2TypedValue(final double value, final BCF2Type type) { - this(new Double(value), type); - } - - private BCF2TypedValue(final Object value, final BCF2Type type) { - this.type = type; - this.value = value; - } - - public boolean isMissing() { return value == null; } - - @Override - public String toString() { - return String.format("%s of %s", value, type); - } - } - - // ----------------------------------------------------------------- - // - // Test encoding of basic types - // - // ----------------------------------------------------------------- - - @DataProvider(name = "BCF2EncodingTestProviderBasicTypes") - public Object[][] BCF2EncodingTestProviderBasicTypes() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv : basicTypes ) - tests.add(new Object[]{Arrays.asList(tv)}); - return tests.toArray(new Object[][]{}); - } - - private interface EncodeMe { - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException; - } - - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithStaticCalls(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - switch ( tv.type ) { - case INT8: - case INT16: - case INT32: - encoder.encodeTypedInt((Integer)tv.value, tv.type); - break; - case FLOAT: - encoder.encodeTypedFloat((Double)tv.value); - break; - case CHAR: - encoder.encodeTypedString((String)tv.value); - break; - } - } - }); - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encodeTyped(tv.value, tv.type); - } - }); - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2BasicTypesWithObjectNoType(final List toEncode) throws IOException { - testBCF2BasicTypesWithEncodeMe(toEncode, - new EncodeMe() { - @Override - public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException { - encoder.encode(tv.value); - } - }); - } - - public void testBCF2BasicTypesWithEncodeMe(final List toEncode, final EncodeMe func) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - BCF2Encoder encoder = new BCF2Encoder(); - func.encode(encoder, tv); - - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - final Object decoded = decoder.decodeTypedValue(); - - Assert.assertNotNull(decoded); - Assert.assertFalse(decoded instanceof List); - myAssertEquals(tv, decoded); - } - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectors(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { - BCF2Encoder encoder = new BCF2Encoder(); - List expected = Collections.nCopies(length, tv.value); - encoder.encodeTyped(expected, tv.type); - - BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - final Object decoded = decoder.decodeTypedValue(); - - Assert.assertTrue(decoded instanceof List); - final List decodedList = (List)decoded; - Assert.assertEquals(decodedList.size(), expected.size()); - for ( Object decodedValue : decodedList ) - myAssertEquals(tv, decodedValue); - } - } - } - - @DataProvider(name = "BCF2EncodingTestProviderSingletons") - public Object[][] BCF2EncodingTestProviderSingletons() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv : primitives ) - tests.add(new Object[]{Arrays.asList(tv)}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BCF2EncodingTestProviderSingletons") - public void testBCF2EncodingSingletons(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); - } - - // ----------------------------------------------------------------- - // - // Test encoding of vectors - // - // ----------------------------------------------------------------- - - @DataProvider(name = "BCF2EncodingTestProviderSequences") - public Object[][] BCF2EncodingTestProviderSequences() { - List tests = new ArrayList(); - for ( BCF2TypedValue tv1 : forCombinations ) - for ( BCF2TypedValue tv2 : forCombinations ) - for ( BCF2TypedValue tv3 : forCombinations ) - tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BCF2EncodingTestProviderBasicTypes") - public void testBCF2EncodingVectorsWithMissing(final List toEncode) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.type != BCF2Type.CHAR ) { - for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) { - final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type); - - final BCF2Encoder encoder = new BCF2Encoder(); - for ( int i = 0; i < length; i++ ) { - encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type); - } - - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - - for ( int i = 0; i < length; i++ ) { - final Object decoded = decoder.decodeTypedValue(td); - myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded); - } - } - } - } - } - - @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons") - public void testBCF2EncodingTestProviderSequences(final List toEncode) throws IOException { - final byte[] record = encodeRecord(toEncode); - decodeRecord(toEncode, record); - } - - // ----------------------------------------------------------------- - // - // Test strings and lists of strings - // - // ----------------------------------------------------------------- - - @DataProvider(name = "ListOfStrings") - public Object[][] listOfStringsProvider() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"}); - tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ListOfStrings") - public void testEncodingListOfString(List strings, String expected) throws IOException { - final String collapsed = BCF2Utils.collapseStringList(strings); - Assert.assertEquals(collapsed, expected); - Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings); - } - - // ----------------------------------------------------------------- - // - // Tests to determine the best type of arrays of integers - // - // ----------------------------------------------------------------- - - @DataProvider(name = "BestIntTypeTests") - public Object[][] BestIntTypeTests() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList(1), BCF2Type.INT8}); - tests.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8}); - tests.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8}); - tests.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8}); - tests.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16}); - tests.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32}); - tests.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32}); - tests.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32}); - tests.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32}); - tests.add(new Object[]{Arrays.asList(-100000, 1, -10), BCF2Type.INT32}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BestIntTypeTests") - public void determineBestEncoding(final List ints, final BCF2Type expectedType) throws IOException { - BCF2Encoder encoder = new BCF2Encoder(); - Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType); - Assert.assertEquals(BCF2Utils.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType); - } - - // ----------------------------------------------------------------- - // - // Tests managing and skipping multiple blocks - // - // ----------------------------------------------------------------- - - @Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences") - public void testReadAndSkipWithMultipleBlocks(final List block) throws IOException { - testReadAndSkipWithMultipleBlocks(block, forCombinations); - testReadAndSkipWithMultipleBlocks(forCombinations, block); - } - - public void testReadAndSkipWithMultipleBlocks(final List block1, final List block2) throws IOException { - final byte[] record1 = encodeRecord(block1); - final byte[] record2 = encodeRecord(block2); - - // each record is individually good - decodeRecord(block1, record1); - decodeRecord(block2, record2); - - BCF2Decoder decoder = new BCF2Decoder(); - - // test setting - decoder.setRecordBytes(record1); - decodeRecord(block1, decoder); - decoder.setRecordBytes(record2); - decodeRecord(block2, decoder); - - // test combining the streams - final byte[] combined = combineRecords(record1, record2); - final List combinedObjects = new ArrayList(block1); - combinedObjects.addAll(block2); - - // the combined bytes is the same as the combined objects - InputStream stream = new ByteArrayInputStream(combined); - decoder.readNextBlock(record1.length, stream); - decodeRecord(block1, decoder); - decoder.readNextBlock(record2.length, stream); - decodeRecord(block2, decoder); - - // skipping the first block allows us to read the second block directly - stream = new ByteArrayInputStream(combined); - decoder.skipNextBlock(record1.length, stream); - decoder.readNextBlock(record2.length, stream); - decodeRecord(block2, decoder); - } - - // ----------------------------------------------------------------- - // - // Test encoding / decoding arrays of ints - // - // This checks that we can encode and decode correctly with the - // low-level decodeIntArray function arrays of values. This - // has to be pretty comprehensive as decodeIntArray is a highly optimized - // piece of code with lots of edge cases. The values we are encoding - // don't really matter -- just that the values come back as expected. - // - // ----------------------------------------------------------------- - - @DataProvider(name = "IntArrays") - public Object[][] makeIntArrays() { - List tests = new ArrayList(); - - for ( int nValues : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - for ( int nPad : Arrays.asList(0, 1, 2, 5, 10, 100) ) { - int nElements = nValues + nPad; - - List values = new ArrayList(nElements); - - // add nValues from 0 to nValues - 1 - for ( int i = 0; i < nValues; i++ ) - values.add(i); - - // add nPad nulls - for ( int i = 0; i < nPad; i++ ) - values.add(null); - - tests.add(new Object[]{values}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "IntArrays") - public void testIntArrays(final List ints) throws IOException { - final BCF2Encoder encoder = new BCF2Encoder(); - encoder.encodeTyped(ints, BCF2Type.INT16); - - final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes()); - - final byte typeDescriptor = decoder.readTypeDescriptor(); - - // read the int[] with the low-level version - final int size = decoder.decodeNumberOfElements(typeDescriptor); - final int[] decoded = decoder.decodeIntArray(typeDescriptor, size); - - if ( isMissing(ints) ) { - // we expect that the result is null in this case - Assert.assertNull(decoded, "Encoded all missing values -- expected null"); - } else { - // we expect at least some values to come back - Assert.assertTrue(decoded.length > 0, "Must have at least 1 element for non-null encoded data"); - - // check corresponding values - for ( int i = 0; i < ints.size(); i++ ) { - final Integer expected = ints.get(i); - - if ( expected == null ) { - Assert.assertTrue(decoded.length <= i, "we expect decoded to be truncated for missing values"); - } else { - Assert.assertTrue(decoded.length > i, "we expected at least " + i + " values in decoded array"); - Assert.assertEquals(decoded[i], (int)expected); - } - } - } - } - - // ----------------------------------------------------------------- - // - // Helper routines - // - // ----------------------------------------------------------------- - - private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(record1); - baos.write(record2); - return baos.toByteArray(); - } - - private final byte[] encodeRecord(final List toEncode) throws IOException { - BCF2Encoder encoder = new BCF2Encoder(); - - for ( final BCF2TypedValue tv : toEncode ) { - if ( tv.isMissing() ) - encoder.encodeTypedMissing(tv.type); - else { - final BCF2Type encodedType = encoder.encode(tv.value); - if ( tv.type != null ) // only if we have an expectation - Assert.assertEquals(encodedType, tv.type); - } - } - - // check output - final byte[] record = encoder.getRecordBytes(); - Assert.assertNotNull(record); - Assert.assertTrue(record.length > 0); - return record; - } - - private final void decodeRecord(final List toEncode, final byte[] record) throws IOException { - decodeRecord(toEncode, new BCF2Decoder(record)); - } - - private final void decodeRecord(final List toEncode, final BCF2Decoder decoder) throws IOException { - for ( final BCF2TypedValue tv : toEncode ) { - Assert.assertFalse(decoder.blockIsFullyDecoded()); - final Object decoded = decoder.decodeTypedValue(); - - myAssertEquals(tv, decoded); - } - - Assert.assertTrue(decoder.blockIsFullyDecoded()); - } - - private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) { - if ( tv.value == null ) { // special needs for instanceof double - Assert.assertEquals(decoded, tv.value); - } else if ( tv.type == BCF2Type.FLOAT ) { // need tolerance for floats, and they aren't null - Assert.assertTrue(decoded instanceof Double); - - final double valueFloat = (Double)tv.value; - final double decodedFloat = (Double)decoded; - - VariantBaseTest.assertEqualsDoubleSmart(decodedFloat, valueFloat, FLOAT_TOLERANCE); - } else - Assert.assertEquals(decoded, tv.value); - } - - private final boolean isMissing(final List values) { - if ( values != null ) - for ( Integer value : values ) - if ( value != null ) - return false; - return true; - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/bcf2/BCF2UtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/bcf2/BCF2UtilsUnitTest.java deleted file mode 100644 index 5d01a458b..000000000 --- a/public/java/test/org/broadinstitute/variant/bcf2/BCF2UtilsUnitTest.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.bcf2; - -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; - -import java.util.*; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -/** - * Tests for BCF2Utils - */ -public final class BCF2UtilsUnitTest extends VariantBaseTest { - @DataProvider(name = "CollapseExpandTest") - public Object[][] makeCollapseExpandTest() { - List tests = new ArrayList(); - tests.add(new Object[]{Arrays.asList("A"), "A", false}); - tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true}); - tests.add(new Object[]{Arrays.asList("AB"), "AB", false}); - tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true}); - tests.add(new Object[]{Arrays.asList(), "", false}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CollapseExpandTest") - public void testCollapseExpandTest(final List in, final String expectedCollapsed, final boolean isCollapsed) { - final String actualCollapsed = BCF2Utils.collapseStringList(in); - Assert.assertEquals(actualCollapsed, expectedCollapsed); - Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed); - if ( isCollapsed ) - Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in); - } - - @DataProvider(name = "HeaderOrderTestProvider") - public Object[][] makeHeaderOrderTestProvider() { - final List inputLines = new ArrayList(); - final List extraLines = new ArrayList(); - - int counter = 0; - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); - - extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); - extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - extraLines.add(new VCFHeaderLine("x", "misc")); - extraLines.add(new VCFHeaderLine("y", "misc")); - - List tests = new ArrayList(); - for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) { - final List empty = Collections.emptyList(); - final List> permutations = extrasToTake == 0 - ? Collections.singletonList(empty) - : GeneralUtils.makePermutations(extraLines, extrasToTake, false); - for ( final List permutation : permutations ) { - for ( int i = -1; i < inputLines.size(); i++ ) { - final List allLines = new ArrayList(inputLines); - if ( i >= 0 ) - allLines.remove(i); - allLines.addAll(permutation); - final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); - final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter); - tests.add(new Object[]{inputHeader, testHeader, expectedConsistent}); - } - } - } - - // sample name tests - final List> sampleNameTests = Arrays.asList( - new ArrayList(), - Arrays.asList("A"), - Arrays.asList("A", "B"), - Arrays.asList("A", "B", "C")); - for ( final List inSamples : sampleNameTests ) { - for ( final List testSamples : sampleNameTests ) { - final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples); - - final List> permutations = testSamples.isEmpty() - ? Collections.singletonList(testSamples) - : GeneralUtils.makePermutations(testSamples, testSamples.size(), false); - for ( final List testSamplesPermutation : permutations ) { - final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation); - final boolean expectedConsistent = testSamples.equals(inSamples); - tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) { - final List ids = new ArrayList(); - for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine ) { - ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); - } - } - - // as long as the start contains all of the ids up to minCounterForInputLines in order - for ( int i = 0; i < minCounterForInputLines; i++ ) - if ( i >= ids.size() || ids.get(i) != i ) - return false; - - return true; - } - - // - // Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2 - // even when the header file is slightly different - // - @Test(dataProvider = "HeaderOrderTestProvider") - public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) { - final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader); - Assert.assertEquals(actualOrderConsistency, expectedConsistent); - } -} diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/AlleleUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/AlleleUnitTest.java deleted file mode 100644 index 7fa652f2f..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/AlleleUnitTest.java +++ /dev/null @@ -1,180 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -// the imports for unit testing. - -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.Test; - -// public Allele(byte[] bases, boolean isRef) { -// public Allele(boolean isRef) { -// public Allele(String bases, boolean isRef) { -// public boolean isReference() { return isRef; } -// public boolean isNonReference() { return ! isReference(); } -// public byte[] getBases() { return bases; } -// public boolean equals(Allele other) { -// public int length() { - -/** - * Basic unit test for RecalData - */ -public class AlleleUnitTest extends VariantBaseTest { - Allele ARef, A, T, ATIns, ATCIns, NoCall; - - @BeforeSuite - public void before() { - A = Allele.create("A"); - ARef = Allele.create("A", true); - T = Allele.create("T"); - - ATIns = Allele.create("AT"); - ATCIns = Allele.create("ATC"); - - NoCall = Allele.create("."); - } - - @Test - public void testCreatingSNPAlleles() { - Assert.assertTrue(A.isNonReference()); - Assert.assertFalse(A.isReference()); - Assert.assertTrue(A.basesMatch("A")); - Assert.assertEquals(A.length(), 1); - - Assert.assertTrue(ARef.isReference()); - Assert.assertFalse(ARef.isNonReference()); - Assert.assertTrue(ARef.basesMatch("A")); - Assert.assertFalse(ARef.basesMatch("T")); - - Assert.assertTrue(T.isNonReference()); - Assert.assertFalse(T.isReference()); - Assert.assertTrue(T.basesMatch("T")); - Assert.assertFalse(T.basesMatch("A")); - } - - @Test - public void testCreatingNoCallAlleles() { - Assert.assertTrue(NoCall.isNonReference()); - Assert.assertFalse(NoCall.isReference()); - Assert.assertFalse(NoCall.basesMatch(".")); - Assert.assertEquals(NoCall.length(), 0); - Assert.assertTrue(NoCall.isNoCall()); - Assert.assertFalse(NoCall.isCalled()); - } - - - @Test - public void testCreatingIndelAlleles() { - Assert.assertEquals(ATIns.length(), 2); - Assert.assertEquals(ATCIns.length(), 3); - Assert.assertEquals(ATIns.getBases(), "AT".getBytes()); - Assert.assertEquals(ATCIns.getBases(), "ATC".getBytes()); - } - - - @Test - public void testConstructors1() { - Allele a1 = Allele.create("A"); - Allele a2 = Allele.create("A".getBytes()); - Allele a3 = Allele.create("A"); - Allele a4 = Allele.create("A", true); - - Assert.assertTrue(a1.equals(a2)); - Assert.assertTrue(a1.equals(a3)); - Assert.assertFalse(a1.equals(a4)); - } - - @Test - public void testInsConstructors() { - Allele a1 = Allele.create("AC"); - Allele a2 = Allele.create("AC".getBytes()); - Allele a3 = Allele.create("AC"); - Allele a4 = Allele.create("AC", true); - - Assert.assertTrue(a1.equals(a2)); - Assert.assertTrue(a1.equals(a3)); - Assert.assertFalse(a1.equals(a4)); - } - - @Test - public void testEquals() { - Assert.assertTrue(ARef.basesMatch(A)); - Assert.assertFalse(ARef.equals(A)); - Assert.assertFalse(ARef.equals(ATIns)); - Assert.assertFalse(ARef.equals(ATCIns)); - - Assert.assertTrue(T.basesMatch(T)); - Assert.assertFalse(T.basesMatch(A)); - Assert.assertFalse(T.equals(A)); - - Assert.assertTrue(ATIns.equals(ATIns)); - Assert.assertFalse(ATIns.equals(ATCIns)); - Assert.assertTrue(ATIns.basesMatch("AT")); - Assert.assertFalse(ATIns.basesMatch("A")); - Assert.assertFalse(ATIns.basesMatch("ATC")); - - Assert.assertTrue(ATIns.basesMatch("AT")); - Assert.assertFalse(ATIns.basesMatch("ATC")); - } - - @Test (expectedExceptions = IllegalArgumentException.class) - public void testBadConstructorArgs1() { - byte[] foo = null; - Allele.create(foo); - } - - @Test (expectedExceptions = IllegalArgumentException.class) - public void testBadConstructorArgs2() { - Allele.create("x"); - } - - @Test (expectedExceptions = IllegalArgumentException.class) - public void testBadConstructorArgs3() { - Allele.create("--"); - } - - @Test (expectedExceptions = IllegalArgumentException.class) - public void testBadConstructorArgs4() { - Allele.create("-A"); - } - - @Test (expectedExceptions = IllegalArgumentException.class) - public void testBadConstructorArgs5() { - Allele.create("A A"); - } - - @Test - public void testExtend() { - Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString()); - Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString()); - Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString()); - Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString()); - Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString()); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java deleted file mode 100644 index 562130101..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ /dev/null @@ -1,203 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -// the imports for unit testing. - - -import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.EnumMap; -import java.util.List; - - -/** - * Basic unit test for Genotype likelihoods objects - */ -public class GenotypeLikelihoodsUnitTest extends VariantBaseTest { - double [] v = new double[]{-10.5, -1.25, -5.11}; - final static String vGLString = "-10.50,-1.25,-5.11"; - final static String vPLString = "93,0,39"; - double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC - - @Test - public void testFromVector2() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(v); - assertDoubleArraysAreEqual(gl.getAsVector(), v); - Assert.assertEquals(gl.getAsString(), vPLString); - } - - @Test - public void testFromString1() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField(vPLString); - assertDoubleArraysAreEqual(gl.getAsVector(), new double[]{-9.3, 0, -3.9}); - Assert.assertEquals(gl.getAsString(), vPLString); - } - - @Test - public void testFromString2() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromGLField(vGLString); - assertDoubleArraysAreEqual(gl.getAsVector(), v); - Assert.assertEquals(gl.getAsString(), vPLString); - } - - @Test (expectedExceptions = TribbleException.class) - public void testErrorBadFormat() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField("adf,b,c"); - gl.getAsVector(); - } - - @Test - public void testGetAsMap(){ - GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(v); - //Log scale - EnumMap glMap = gl.getAsMap(false); - Assert.assertEquals(v[GenotypeType.HOM_REF.ordinal()-1],glMap.get(GenotypeType.HOM_REF)); - Assert.assertEquals(v[GenotypeType.HET.ordinal()-1],glMap.get(GenotypeType.HET)); - Assert.assertEquals(v[GenotypeType.HOM_VAR.ordinal()-1],glMap.get(GenotypeType.HOM_VAR)); - - //Linear scale - glMap = gl.getAsMap(true); - double [] vl = GeneralUtils.normalizeFromLog10(v); - Assert.assertEquals(vl[GenotypeType.HOM_REF.ordinal()-1],glMap.get(GenotypeType.HOM_REF)); - Assert.assertEquals(vl[GenotypeType.HET.ordinal()-1],glMap.get(GenotypeType.HET)); - Assert.assertEquals(vl[GenotypeType.HOM_VAR.ordinal()-1],glMap.get(GenotypeType.HOM_VAR)); - - //Test missing likelihoods - gl = GenotypeLikelihoods.fromPLField("."); - glMap = gl.getAsMap(false); - Assert.assertNull(glMap); - - } - - @Test - public void testCalculateNumLikelihoods() { - - for (int nAlleles=2; nAlleles<=5; nAlleles++) - // simplest case: diploid - Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); - - // some special cases: ploidy = 20, #alleles = 4 - Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771); - } - - @Test - public void testGetLog10GQ(){ - GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField(vPLString); - - //GQ for the best guess genotype - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HET),-3.9); - - double[] test = GeneralUtils.normalizeFromLog10(gl.getAsVector()); - - //GQ for the other genotypes - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_REF), Math.log10(1.0 - test[GenotypeType.HOM_REF.ordinal()-1])); - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_VAR), Math.log10(1.0 - test[GenotypeType.HOM_VAR.ordinal()-1])); - - //Test missing likelihoods - gl = GenotypeLikelihoods.fromPLField("."); - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_REF),Double.NEGATIVE_INFINITY); - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HET),Double.NEGATIVE_INFINITY); - Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_VAR),Double.NEGATIVE_INFINITY); - - } - - @Test - public void testgetQualFromLikelihoods() { - double[] likelihoods = new double[]{-1, 0, -2}; - // qual values we expect for each possible "best" genotype - double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294}; - - for ( int i = 0; i < likelihoods.length; i++ ) { - Assert.assertEquals(GenotypeLikelihoods.getGQLog10FromLikelihoods(i, likelihoods), expectedQuals[i], 1e-6, - "GQ value for genotype " + i + " was not calculated correctly"); - } - } - - // this test is completely broken, the method is wrong. - public void testGetQualFromLikelihoodsMultiAllelicBroken() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - double actualGQ = gl.getLog10GQ(GenotypeType.HET); - double expectedGQ = 1.6; - Assert.assertEquals(actualGQ,expectedGQ); - } - - public void testGetQualFromLikelihoodsMultiAllelic() { - GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create((byte)'A',true); - Allele alt1 = Allele.create((byte)'C'); - Allele alt2 = Allele.create((byte)'T'); - List allAlleles = Arrays.asList(ref,alt1,alt2); - List gtAlleles = Arrays.asList(alt1,alt2); - GenotypeBuilder gtBuilder = new GenotypeBuilder(); - gtBuilder.alleles(gtAlleles); - double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles); - double expectedGQ = 1.6; - Assert.assertEquals(actualGQ,expectedGQ); - } - - private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { - Assert.assertEquals(v1.length, v2.length); - for ( int i = 0; i < v1.length; i++ ) { - Assert.assertEquals(v1[i], v2[i], 1e-6); - } - } - - @Test - public void testCalculatePLindex(){ - int counter = 0; - for ( int i = 0; i <= 3; i++ ) { - for ( int j = i; j <= 3; j++ ) { - Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly"); - } - } - } - - @Test - public void testGetAllelePair(){ - allelePairTest(0, 0, 0); - allelePairTest(1, 0, 1); - allelePairTest(2, 1, 1); - allelePairTest(3, 0, 2); - allelePairTest(4, 1, 2); - allelePairTest(5, 2, 2); - allelePairTest(6, 0, 3); - allelePairTest(7, 1, 3); - allelePairTest(8, 2, 3); - allelePairTest(9, 3, 3); - } - - private void allelePairTest(int PLindex, int allele1, int allele2) { - Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly"); - Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly"); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeUnitTest.java deleted file mode 100644 index 8d0d2af90..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeUnitTest.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -// the imports for unit testing. - - -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.Test; - - -public class GenotypeUnitTest extends VariantBaseTest { - Allele A, Aref, T; - - @BeforeSuite - public void before() { - A = Allele.create("A"); - Aref = Allele.create("A", true); - T = Allele.create("T"); - } - - private static final GenotypeBuilder makeGB() { - return new GenotypeBuilder("misc"); - } - - @Test - public void testFilters() { - Assert.assertFalse(makeGB().make().isFiltered(), "by default Genotypes must be PASS"); - Assert.assertNull(makeGB().make().getFilters(), "by default Genotypes must be PASS => getFilters() == null"); - Assert.assertFalse(makeGB().filter(null).make().isFiltered(), "setting filter == null => Genotypes must be PASS"); - Assert.assertNull(makeGB().filter(null).make().getFilters(), "Genotypes PASS => getFilters == null"); - Assert.assertFalse(makeGB().filter("PASS").make().isFiltered(), "setting filter == PASS => Genotypes must be PASS"); - Assert.assertNull(makeGB().filter("PASS").make().getFilters(), "Genotypes PASS => getFilters == null"); - Assert.assertTrue(makeGB().filter("x").make().isFiltered(), "setting filter != null => Genotypes must be PASS"); - Assert.assertEquals(makeGB().filter("x").make().getFilters(), "x", "Should get back the expected filter string"); - Assert.assertEquals(makeGB().filters("x", "y").make().getFilters(), "x;y", "Multiple filter field values should be joined with ;"); - Assert.assertEquals(makeGB().filters("x", "y", "z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;"); - Assert.assertTrue(makeGB().filters("x", "y", "z").make().isFiltered(), "Multiple filter values should be filtered"); - Assert.assertEquals(makeGB().filter("x;y;z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;"); - } - -// public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { -// public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { -// public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) -// public Genotype(String sampleName, List alleles, double negLog10PError) -// public Genotype(String sampleName, List alleles) -// public List getAlleles() -// public List getAlleles(Allele allele) -// public Allele getAllele(int i) -// public boolean isPhased() -// public int getPloidy() -// public Type getType() -// public boolean isHom() -// public boolean isHomRef() -// public boolean isHomVar() -// public boolean isHet() -// public boolean isNoCall() -// public boolean isCalled() -// public boolean isAvailable() -// public boolean hasLikelihoods() -// public GenotypeLikelihoods getLikelihoods() -// public boolean sameGenotype(Genotype other) -// public boolean sameGenotype(Genotype other, boolean ignorePhase) -// public String getSampleName() -// public boolean hasLog10PError() -// public double getLog10PError() -// public double getPhredScaledQual() -// public boolean hasExtendedAttribute(String key) -// public Object getExtendedAttribute(String key) -// public Object getExtendedAttribute(String key, Object defaultValue) -// public String getAttributeAsString(String key, String defaultValue) -// public int getAttributeAsInt(String key, int defaultValue) -// public double getAttributeAsDouble(String key, double defaultValue) -// public boolean getAttributeAsBoolean(String key, boolean defaultValue) -} diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypesContextUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypesContextUnitTest.java deleted file mode 100644 index 1618ad1f2..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypesContextUnitTest.java +++ /dev/null @@ -1,309 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -// the imports for unit testing. - - -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - - -public class GenotypesContextUnitTest extends VariantBaseTest { - Allele Aref, C, T; - Genotype AA, AT, TT, AC, CT, CC, MISSING; - List allGenotypes; - - @BeforeSuite - public void before() { - C = Allele.create("C"); - Aref = Allele.create("A", true); - T = Allele.create("T"); - AA = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - AT = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - TT = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - AC = GenotypeBuilder.create("AC", Arrays.asList(Aref, C)); - CT = GenotypeBuilder.create("CT", Arrays.asList(C, T)); - CC = GenotypeBuilder.create("CC", Arrays.asList(C, C)); - MISSING = GenotypeBuilder.create("MISSING", Arrays.asList(C, C)); - - allGenotypes = Arrays.asList(AA, AT, TT, AC, CT, CC); - } - - // -------------------------------------------------------------------------------- - // - // Provider - // - // -------------------------------------------------------------------------------- - - private interface ContextMaker { - public GenotypesContext make(List initialSamples); - } - - private ContextMaker baseMaker = new ContextMaker() { - @Override - public GenotypesContext make(final List initialSamples) { - return GenotypesContext.copy(initialSamples); - } - - @Override - public String toString() { - return "GenotypesContext"; - } - }; - - private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker { - @Override - public LazyGenotypesContext.LazyData parse(final Object data) { - GenotypesContext gc = GenotypesContext.copy((List)data); - gc.ensureSampleNameMap(); - gc.ensureSampleOrdering(); - return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset); - } - - @Override - public GenotypesContext make(final List initialSamples) { - return new LazyGenotypesContext(this, initialSamples, initialSamples.size()); - } - - @Override - public String toString() { - return "LazyGenotypesContext"; - } - } - - private Collection allMakers = Arrays.asList(baseMaker, new lazyMaker()); - - private class GenotypesContextProvider extends TestDataProvider { - ContextMaker maker; - final List initialSamples; - - private GenotypesContextProvider(ContextMaker maker, List initialSamples) { - super(GenotypesContextProvider.class, String.format("%s with %d samples", maker.toString(), initialSamples.size())); - this.maker = maker; - this.initialSamples = initialSamples; - } - - public GenotypesContext makeContext() { - return maker.make(initialSamples); - } - } - - @DataProvider(name = "GenotypesContextProvider") - public Object[][] MakeSampleNamesTest() { - for ( ContextMaker maker : allMakers ) { - for ( int i = 0; i < allGenotypes.size(); i++ ) { - List samples = allGenotypes.subList(0, i); - // sorted - new GenotypesContextProvider(maker, samples); - // unsorted - new GenotypesContextProvider(maker, GeneralUtils.reverse(samples)); - } - } - - return GenotypesContextProvider.getTests(GenotypesContextProvider.class); - } - - private final static void testIterable(Iterable genotypeIterable, Set expectedNames) { - int count = 0; - for ( final Genotype g : genotypeIterable ) { - Assert.assertTrue(expectedNames.contains(g.getSampleName())); - count++; - } - Assert.assertEquals(count, expectedNames.size(), "Iterable returned unexpected number of genotypes"); - } - - @Test(dataProvider = "GenotypesContextProvider") - public void testInitialSamplesAreAsExpected(GenotypesContextProvider cfg) { - testGenotypesContextContainsExpectedSamples(cfg.makeContext(), cfg.initialSamples); - } - - private final void testGenotypesContextContainsExpectedSamples(GenotypesContext gc, List expectedSamples) { - Assert.assertEquals(gc.isEmpty(), expectedSamples.isEmpty()); - Assert.assertEquals(gc.size(), expectedSamples.size()); - - // get(index) is doing the right thing - for ( int i = 0; i < expectedSamples.size(); i++ ) { - Assert.assertEquals(gc.get(i), expectedSamples.get(i)); - } - Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); - - // we can fetch samples by name - final Set genotypeNames = VariantContextUtils.genotypeNames(expectedSamples); - for ( final String name : genotypeNames ) { - Assert.assertTrue(gc.containsSample(name)); - } - Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); - - // all of the iterators are working - testIterable(gc.iterateInSampleNameOrder(), genotypeNames); - testIterable(gc, genotypeNames); - testIterable(gc.iterateInSampleNameOrder(genotypeNames), genotypeNames); - if ( ! genotypeNames.isEmpty() ) { - Set first = Collections.singleton(genotypeNames.iterator().next()); - testIterable(gc.iterateInSampleNameOrder(first), first); - } - - // misc. utils are working as expected - Assert.assertEquals(gc.getSampleNames(), genotypeNames); - Assert.assertTrue(ParsingUtils.isSorted(gc.getSampleNamesOrderedByName())); - Assert.assertTrue(ParsingUtils.isSorted(gc.iterateInSampleNameOrder())); - Assert.assertTrue(gc.containsSamples(genotypeNames)); - - final Set withMissing = new HashSet(Arrays.asList(MISSING.getSampleName())); - withMissing.addAll(genotypeNames); - Assert.assertFalse(gc.containsSamples(withMissing)); - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testImmutable(GenotypesContextProvider cfg) { - GenotypesContext gc = cfg.makeContext(); - Assert.assertEquals(gc.isMutable(), true); - gc.immutable(); - Assert.assertEquals(gc.isMutable(), false); - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class ) - public void testImmutableCall1(GenotypesContextProvider cfg) { - GenotypesContext gc = cfg.makeContext(); - gc.immutable(); - gc.add(MISSING); - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testClear(GenotypesContextProvider cfg) { - GenotypesContext gc = cfg.makeContext(); - gc.clear(); - testGenotypesContextContainsExpectedSamples(gc, Collections.emptyList()); - } - - private static final List with(List genotypes, Genotype ... add) { - List l = new ArrayList(genotypes); - l.addAll(Arrays.asList(add)); - return l; - } - - private static final List without(List genotypes, Genotype ... remove) { - List l = new ArrayList(genotypes); - l.removeAll(Arrays.asList(remove)); - return l; - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testAdds(GenotypesContextProvider cfg) { - Genotype add1 = GenotypeBuilder.create("add1", Arrays.asList(Aref, Aref)); - Genotype add2 = GenotypeBuilder.create("add2", Arrays.asList(Aref, Aref)); - - GenotypesContext gc = cfg.makeContext(); - gc.add(add1); - testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1)); - - gc = cfg.makeContext(); - gc.add(add1); - gc.add(add2); - testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); - - gc = cfg.makeContext(); - gc.addAll(Arrays.asList(add1, add2)); - testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2)); - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testRemoves(GenotypesContextProvider cfg) { - Genotype rm1 = AA; - Genotype rm2 = AC; - - GenotypesContext gc = cfg.makeContext(); - if (gc.size() > 1) { - Genotype rm = gc.get(0); - gc.remove(rm); - testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm)); - } - - gc = cfg.makeContext(); - gc.remove(rm1); - testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1)); - - gc = cfg.makeContext(); - gc.remove(rm1); - gc.remove(rm2); - testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); - - gc = cfg.makeContext(); - gc.removeAll(Arrays.asList(rm1, rm2)); - testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2)); - - gc = cfg.makeContext(); - HashSet expected = new HashSet(); - if ( gc.contains(rm1) ) expected.add(rm1); - if ( gc.contains(rm2) ) expected.add(rm2); - gc.retainAll(Arrays.asList(rm1, rm2)); - - // ensure that the two lists are the same - Assert.assertEquals(new HashSet(gc.getGenotypes()), expected); - // because the list order can change, we use the gc's list itself - testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes()); - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testSet(GenotypesContextProvider cfg) { - Genotype set = GenotypeBuilder.create("replace", Arrays.asList(Aref, Aref)); - int n = cfg.makeContext().size(); - for ( int i = 0; i < n; i++ ) { - GenotypesContext gc = cfg.makeContext(); - Genotype setted = gc.set(i, set); - Assert.assertNotNull(setted); - ArrayList l = new ArrayList(cfg.initialSamples); - l.set(i, set); - testGenotypesContextContainsExpectedSamples(gc, l); - } - } - - @Test(enabled = true, dataProvider = "GenotypesContextProvider") - public void testReplace(GenotypesContextProvider cfg) { - int n = cfg.makeContext().size(); - for ( int i = 0; i < n; i++ ) { - GenotypesContext gc = cfg.makeContext(); - Genotype toReplace = gc.get(i); - Genotype replacement = GenotypeBuilder.create(toReplace.getSampleName(), Arrays.asList(Aref, Aref)); - gc.replace(replacement); - ArrayList l = new ArrayList(cfg.initialSamples); - l.set(i, replacement); - Assert.assertEquals(replacement, gc.get(i)); - testGenotypesContextContainsExpectedSamples(gc, l); - } - } - - // subset to samples tested in VariantContextUnitTest -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java deleted file mode 100644 index 4c948e8e2..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java +++ /dev/null @@ -1,974 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.FeatureCodecHeader; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.utils.GeneralUtils; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.testng.Assert; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.*; - -/** - * Routines for generating all sorts of VCs for testing - * - * @author Your Name - * @since Date created - */ -public class VariantContextTestProvider { - final private static boolean ENABLE_GENOTYPE_TESTS = true; - final private static boolean ENABLE_A_AND_G_TESTS = true; - final private static boolean ENABLE_VARARRAY_TESTS = true; - final private static boolean ENABLE_PLOIDY_TESTS = true; - final private static boolean ENABLE_PL_TESTS = true; - final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true; - final private static boolean ENABLE_SOURCE_VCF_TESTS = true; - final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; - final private static List TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - - private static VCFHeader syntheticHeader; - final static List TEST_DATAs = new ArrayList(); - private static VariantContext ROOT; - - private final static List testSourceVCFs = new ArrayList(); - static { - testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); - testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ex2.vcf")); - testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "dbsnp_135.b37.1000.vcf")); - if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) { - testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "diagnosis_targets_testfile.vcf")); - testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "VQSR.mixedTest.recal")); - } - } - - public static class VariantContextContainer { - private VCFHeader header; - private Iterable vcs; - - public VariantContextContainer( VCFHeader header, Iterable vcs ) { - this.header = header; - this.vcs = vcs; - } - - public VCFHeader getHeader() { - return header; - } - - public Iterable getVCs() { - return vcs; - } - } - - public abstract static class VariantContextIOTest { - public String toString() { - return "VariantContextIOTest:" + getExtension(); - } - public abstract String getExtension(); - public abstract FeatureCodec makeCodec(); - public abstract VariantContextWriter makeWriter(final File outputFile, final EnumSet baseOptions); - - public List preprocess(final VCFHeader header, List vcsBeforeIO) { - return vcsBeforeIO; - } - - public List postprocess(final VCFHeader header, List vcsAfterIO) { - return vcsAfterIO; - } - } - - public static class VariantContextTestData { - public final VCFHeader header; - public List vcs; - - public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) { - this(header, Collections.singletonList(builder.fullyDecoded(true).make())); - } - - public VariantContextTestData(final VCFHeader header, final List vcs) { - final Set samples = new HashSet(); - for ( final VariantContext vc : vcs ) - if ( vc.hasGenotypes() ) - samples.addAll(vc.getSampleNames()); - this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples); - this.vcs = vcs; - } - - public boolean hasGenotypes() { - return vcs.get(0).hasGenotypes(); - } - - public String toString() { - StringBuilder b = new StringBuilder(); - b.append("VariantContextTestData: ["); - final VariantContext vc = vcs.get(0); - final VariantContextBuilder builder = new VariantContextBuilder(vc); - builder.noGenotypes(); - b.append(builder.make().toString()); - if ( vc.getNSamples() < 5 ) { - for ( final Genotype g : vc.getGenotypes() ) - b.append(g.toString()); - } else { - b.append(" nGenotypes = ").append(vc.getNSamples()); - } - - if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records"); - b.append("]"); - return b.toString(); - } - } - - private final static VariantContextBuilder builder() { - return new VariantContextBuilder(ROOT); - } - - private final static void add(VariantContextBuilder builder) { - TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder)); - } - - public static void initializeTests() throws IOException { - createSyntheticHeader(); - makeSyntheticTests(); - makeEmpiricalTests(); - } - - private static void makeEmpiricalTests() throws IOException { - if ( ENABLE_SOURCE_VCF_TESTS ) { - for ( final File file : testSourceVCFs ) { - VCFCodec codec = new VCFCodec(); - VariantContextContainer x = readAllVCs( file, codec ); - List fullyDecoded = new ArrayList(); - - for ( final VariantContext raw : x.getVCs() ) { - if ( raw != null ) - fullyDecoded.add(raw.fullyDecode(x.getHeader(), false)); - } - - TEST_DATAs.add(new VariantContextTestData(x.getHeader(), fullyDecoded)); - } - } - } - - private final static void addHeaderLine(final Set metaData, final String id, final int count, final VCFHeaderLineType type) { - metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); - if ( type != VCFHeaderLineType.Flag ) - metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); - } - - private final static void addHeaderLine(final Set metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) { - metaData.add(new VCFInfoHeaderLine(id, count, type, "x")); - if ( type != VCFHeaderLineType.Flag ) - metaData.add(new VCFFormatHeaderLine(id, count, type, "x")); - } - - private static void createSyntheticHeader() { - Set metaData = new TreeSet(); - - addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); - addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); - addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String); - addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - - addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String); - addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); - addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); - - // prep the header - metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); - - metaData.add(new VCFFilterHeaderLine("FILTER1")); - metaData.add(new VCFFilterHeaderLine("FILTER2")); - - addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer); - addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float); - addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float); - addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag); - - syntheticHeader = new VCFHeader(metaData); - } - - - private static void makeSyntheticTests() { - VariantContextBuilder rootBuilder = new VariantContextBuilder(); - rootBuilder.source("test"); - rootBuilder.loc("1", 10, 10); - rootBuilder.alleles("A", "C"); - rootBuilder.unfiltered(); - ROOT = rootBuilder.make(); - - add(builder()); - add(builder().alleles("A")); - add(builder().alleles("A", "C", "T")); - add(builder().alleles("A", "AC")); - add(builder().alleles("A", "ACAGT")); - add(builder().loc("1", 10, 11).alleles("AC", "A")); - add(builder().loc("1", 10, 13).alleles("ACGT", "A")); - - // make sure filters work - add(builder().unfiltered()); - add(builder().passFilters()); - add(builder().filters("FILTER1")); - add(builder().filters("FILTER1", "FILTER2")); - - add(builder().log10PError(VariantContext.NO_LOG10_PERROR)); - add(builder().log10PError(-1)); - add(builder().log10PError(-1.234e6)); - - add(builder().noID()); - add(builder().id("rsID12345")); - - - add(builder().attribute("INT1", 1)); - add(builder().attribute("INT1", 100)); - add(builder().attribute("INT1", 1000)); - add(builder().attribute("INT1", 100000)); - add(builder().attribute("INT1", null)); - add(builder().attribute("INT3", Arrays.asList(1, 2, 3))); - add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000))); - add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000))); - add(builder().attribute("INT3", null)); - add(builder().attribute("INT20", TWENTY_INTS)); - - add(builder().attribute("FLOAT1", 1.0)); - add(builder().attribute("FLOAT1", 100.0)); - add(builder().attribute("FLOAT1", 1000.0)); - add(builder().attribute("FLOAT1", 100000.0)); - add(builder().attribute("FLOAT1", null)); - add(builder().attribute("FLOAT3", Arrays.asList(1.0, 2.0, 3.0))); - add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0))); - add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0))); - add(builder().attribute("FLOAT3", null)); - - add(builder().attribute("FLAG", true)); - //add(builder().attribute("FLAG", false)); // NOTE -- VCF doesn't allow false flags - - add(builder().attribute("STRING1", "s1")); - add(builder().attribute("STRING1", null)); - add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3"))); - add(builder().attribute("STRING3", null)); - add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20"))); - - add(builder().attribute("VAR.INFO.STRING", "s1")); - add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2"))); - add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3"))); - add(builder().attribute("VAR.INFO.STRING", null)); - - if ( ENABLE_GENOTYPE_TESTS ) { - addGenotypesToTestData(); - addComplexGenotypesTest(); - } - - if ( ENABLE_A_AND_G_TESTS ) - addGenotypesAndGTests(); - - if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) - addSymbolicAlleleTests(); - } - - private static void addSymbolicAlleleTests() { - // two tests to ensure that the end is computed correctly when there's (and not) an END field present - add(builder().alleles("N", "").start(10).stop(11).attribute("END", 11)); - add(builder().alleles("N", "").start(10).stop(10)); - } - - private static void addGenotypesToTestData() { - final ArrayList sites = new ArrayList(); - - sites.add(builder().alleles("A").make()); - sites.add(builder().alleles("A", "C", "T").make()); - sites.add(builder().alleles("A", "AC").make()); - sites.add(builder().alleles("A", "ACAGT").make()); - - for ( VariantContext site : sites ) { - addGenotypes(site); - } - } - - private static void addGenotypeTests( final VariantContext site, Genotype ... genotypes ) { - // for each sites VC, we are going to add create two root genotypes. - // The first is the primary, and will be added to each new test - // The second is variable. In some tests it's absent (testing 1 genotype), in others it is duplicated - // 1 once, 10, 100, or 1000 times to test scaling - - final VariantContextBuilder builder = new VariantContextBuilder(site); - - // add a single context - builder.genotypes(genotypes[0]); - add(builder); - - if ( genotypes.length > 1 ) { - // add all - add(builder.genotypes(Arrays.asList(genotypes))); - - // add all with the last replicated 10x and 100x times - for ( int nCopiesOfLast : Arrays.asList(10, 100, 1000) ) { - final GenotypesContext gc = new GenotypesContext(); - final Genotype last = genotypes[genotypes.length-1]; - for ( int i = 0; i < genotypes.length - 1; i++ ) - gc.add(genotypes[i]); - for ( int i = 0; i < nCopiesOfLast; i++ ) - gc.add(new GenotypeBuilder(last).name("copy" + i).make()); - add(builder.genotypes(gc)); - } - } - } - - private static void addGenotypes( final VariantContext site) { - // test ref/ref - final Allele ref = site.getReference(); - final Allele alt1 = site.getNAlleles() > 1 ? site.getAlternateAllele(0) : null; - final Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(ref, ref)); - addGenotypeTests(site, homRef); - - if ( alt1 != null ) { - final Genotype het = GenotypeBuilder.create("het", Arrays.asList(ref, alt1)); - final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1)); - addGenotypeTests(site, homRef, het); - addGenotypeTests(site, homRef, het, homVar); - - // test no GT at all - addGenotypeTests(site, new GenotypeBuilder("noGT", new ArrayList(0)).attribute("INT1", 10).make()); - - final List noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - // ploidy - if ( ENABLE_PLOIDY_TESTS ) { - addGenotypeTests(site, - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("hap", Arrays.asList(ref))); - - addGenotypeTests(site, - GenotypeBuilder.create("noCall", noCall), - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("hap", Arrays.asList(ref))); - - addGenotypeTests(site, - GenotypeBuilder.create("noCall", noCall), - GenotypeBuilder.create("noCall2", noCall), - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("hap", Arrays.asList(ref))); - - addGenotypeTests(site, - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); - - addGenotypeTests(site, - GenotypeBuilder.create("noCall", noCall), - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); - - addGenotypeTests(site, - GenotypeBuilder.create("noCall", noCall), - GenotypeBuilder.create("noCall2", noCall), - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); - - addGenotypeTests(site, - GenotypeBuilder.create("nocall", noCall), - GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)), - GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1))); - } - - - // - // - // TESTING PHASE - // - // - final Genotype gUnphased = new GenotypeBuilder("gUnphased", Arrays.asList(ref, alt1)).make(); - final Genotype gPhased = new GenotypeBuilder("gPhased", Arrays.asList(ref, alt1)).phased(true).make(); - final Genotype gPhased2 = new GenotypeBuilder("gPhased2", Arrays.asList(alt1, alt1)).phased(true).make(); - final Genotype gPhased3 = new GenotypeBuilder("gPhased3", Arrays.asList(ref, ref)).phased(true).make(); - final Genotype haploidNoPhase = new GenotypeBuilder("haploidNoPhase", Arrays.asList(ref)).make(); - addGenotypeTests(site, gUnphased, gPhased); - addGenotypeTests(site, gUnphased, gPhased2); - addGenotypeTests(site, gUnphased, gPhased3); - addGenotypeTests(site, gPhased, gPhased2); - addGenotypeTests(site, gPhased, gPhased3); - addGenotypeTests(site, gPhased2, gPhased3); - addGenotypeTests(site, haploidNoPhase, gPhased); - addGenotypeTests(site, haploidNoPhase, gPhased2); - addGenotypeTests(site, haploidNoPhase, gPhased3); - addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2); - addGenotypeTests(site, haploidNoPhase, gPhased, gPhased3); - addGenotypeTests(site, haploidNoPhase, gPhased2, gPhased3); - addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2, gPhased3); - - final Genotype gUnphasedTet = new GenotypeBuilder("gUnphasedTet", Arrays.asList(ref, alt1, ref, alt1)).make(); - final Genotype gPhasedTet = new GenotypeBuilder("gPhasedTet", Arrays.asList(ref, alt1, alt1, alt1)).phased(true).make(); - addGenotypeTests(site, gUnphasedTet, gPhasedTet); - } - - if ( ENABLE_PL_TESTS ) { - if ( site.getNAlleles() == 2 ) { - // testing PLs - addGenotypeTests(site, - GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2}), - GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3})); - - addGenotypeTests(site, - GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), - GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3})); - - addGenotypeTests(site, - GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), - GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2000, -1000})); - - addGenotypeTests(site, // missing PLs - GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}), - GenotypeBuilder.create("g2", Arrays.asList(ref, ref))); - } - else if ( site.getNAlleles() == 3 ) { - // testing PLs - addGenotypeTests(site, - GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2, -3, -4, -5}), - GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3, -4, -5, -6})); - } - } - - // test attributes - addGenotypeTests(site, - attr("g1", ref, "INT1", 1), - attr("g2", ref, "INT1", 2)); - addGenotypeTests(site, - attr("g1", ref, "INT1", 1), - attr("g2", ref, "INT1")); - addGenotypeTests(site, - attr("g1", ref, "INT3", 1, 2, 3), - attr("g2", ref, "INT3", 4, 5, 6)); - addGenotypeTests(site, - attr("g1", ref, "INT3", 1, 2, 3), - attr("g2", ref, "INT3")); - - addGenotypeTests(site, - attr("g1", ref, "INT20", TWENTY_INTS), - attr("g2", ref, "INT20", TWENTY_INTS)); - - - if (ENABLE_VARARRAY_TESTS) { - addGenotypeTests(site, - attr("g1", ref, "INT.VAR", 1, 2, 3), - attr("g2", ref, "INT.VAR", 4, 5), - attr("g3", ref, "INT.VAR", 6)); - addGenotypeTests(site, - attr("g1", ref, "INT.VAR", 1, 2, 3), - attr("g2", ref, "INT.VAR"), - attr("g3", ref, "INT.VAR", 5)); - } - - addGenotypeTests(site, - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "FLOAT1", 2.0)); - addGenotypeTests(site, - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "FLOAT1")); - addGenotypeTests(site, - attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), - attr("g2", ref, "FLOAT3", 4.0, 5.0, 6.0)); - addGenotypeTests(site, - attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0), - attr("g2", ref, "FLOAT3")); - - if (ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS) { - // - // - // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD - // - // - addGenotypeTests(site, - attr("g1", ref, "GS", Arrays.asList("S1", "S2")), - attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); - - addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1 - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "GS", Arrays.asList("S3", "S4"))); - - // variable sized lists - addGenotypeTests(site, - attr("g1", ref, "GV", "S1"), - attr("g2", ref, "GV", Arrays.asList("S3", "S4"))); - - addGenotypeTests(site, - attr("g1", ref, "GV", Arrays.asList("S1", "S2")), - attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - - addGenotypeTests(site, // missing value in varlist of string - attr("g1", ref, "FLOAT1", 1.0), - attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - } - - // - // - // TESTING GENOTYPE FILTERS - // - // - addGenotypeTests(site, - new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); - } - - private static void addGenotypesAndGTests() { -// for ( final int ploidy : Arrays.asList(2)) { - for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) { - final List> alleleCombinations = - Arrays.asList( - Arrays.asList("A"), - Arrays.asList("A", "C"), - Arrays.asList("A", "C", "G"), - Arrays.asList("A", "C", "G", "T")); - - for ( final List alleles : alleleCombinations ) { - final VariantContextBuilder vcb = builder().alleles(alleles); - final VariantContext site = vcb.make(); - final int nAlleles = site.getNAlleles(); - final Allele ref = site.getReference(); - - // base genotype is ref/.../ref up to ploidy - final List baseGenotype = new ArrayList(ploidy); - for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref); - final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy); - - // ada is 0, 1, ..., nAlleles - 1 - final List ada = new ArrayList(nAlleles); - for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i); - - // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy) - final int[] pl = new int[nPLs]; - for ( int i = 0; i < pl.length; i++ ) pl[i] = i; - - final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE"); - gb.alleles(baseGenotype); - gb.PL(pl); - gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada); - vcb.genotypes(gb.make()); - - add(vcb); - } - } - } - - private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) { - if ( value.length == 0 ) - return GenotypeBuilder.create(name, Arrays.asList(ref, ref)); - else { - final Object toAdd = value.length == 1 ? value[0] : Arrays.asList(value); - return new GenotypeBuilder(name, Arrays.asList(ref, ref)).attribute(key, toAdd).make(); - } - } - - public static List generateSiteTests() { - return TEST_DATAs; - } - - public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { - final int nSamples = data.header.getNGenotypeSamples(); - if ( nSamples > 2 ) { - for ( final VariantContext vc : data.vcs ) - if ( vc.isSymbolic() ) - // cannot handle symbolic alleles because they may be weird non-call VCFs - return; - - final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); - tmpFile.deleteOnExit(); - - // write expected to disk - final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final VariantContextWriter writer = tester.makeWriter(tmpFile, options); - - final Set samplesInVCF = new HashSet(data.header.getGenotypeSamples()); - final List missingSamples = Arrays.asList("MISSING1", "MISSING2"); - final List allSamples = new ArrayList(missingSamples); - allSamples.addAll(samplesInVCF); - - final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); - writeVCsToFile(writer, header, data.vcs); - - // ensure writing of expected == actual - final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec()); - final Iterable actual = p.getVCs(); - - int i = 0; - for ( final VariantContext readVC : actual ) { - if ( readVC == null ) continue; // sometimes we read null records... - final VariantContext expected = data.vcs.get(i++); - for ( final Genotype g : readVC.getGenotypes() ) { - Assert.assertTrue(allSamples.contains(g.getSampleName())); - if ( samplesInVCF.contains(g.getSampleName()) ) { - assertEquals(g, expected.getGenotype(g.getSampleName())); - } else { - // missing - Assert.assertTrue(g.isNoCall()); - } - } - } - - } - } - - public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { - testReaderWriter(tester, data.header, data.vcs, data.vcs, true); - } - - public static void testReaderWriter(final VariantContextIOTest tester, - final VCFHeader header, - final List expected, - final Iterable vcs, - final boolean recurse) throws IOException { - final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); - tmpFile.deleteOnExit(); - - // write expected to disk - final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final VariantContextWriter writer = tester.makeWriter(tmpFile, options); - writeVCsToFile(writer, header, vcs); - - // ensure writing of expected == actual - final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec()); - final Iterable actual = p.getVCs(); - assertEquals(actual, expected); - - if ( recurse ) { - // if we are doing a recursive test, grab a fresh iterator over the written values - final Iterable read = readAllVCs(tmpFile, tester.makeCodec()).getVCs(); - testReaderWriter(tester, p.getHeader(), expected, read, false); - } - } - - private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable vcs) { - // write - writer.writeHeader(header); - for ( VariantContext vc : vcs ) - if (vc != null) - writer.add(vc); - writer.close(); - } - - /** - * Utility class to read all of the VC records from a file - * - * @param source - * @param codec - * @return - * @throws IOException - */ - public final static VariantContextContainer readAllVCs( final File source, final FeatureCodec codec ) throws IOException { - // read in the features - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); - - pbs = new PositionalBufferedStream(new FileInputStream(source)); - pbs.skip(header.getHeaderEnd()); - - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - return new VariantContextContainer(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); - } - - public static class VCIterable implements Iterable, Iterator { - final PositionalBufferedStream pbs; - final FeatureCodec codec; - final VCFHeader header; - - private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec codec, final VCFHeader header) { - this.pbs = pbs; - this.codec = codec; - this.header = header; - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - try { - return ! pbs.isDone(); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - } - - @Override - public VariantContext next() { - try { - final VariantContext vc = codec.decode(pbs); - return vc == null ? null : vc.fullyDecode(header, false); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - } - - @Override - public void remove() { - //To change body of implemented methods use File | Settings | File Templates. - } - } - - public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final VariantContextContainer vcfData = readAllVCs(vcfFile, new VCFCodec()); - final VariantContextContainer bcfData = readAllVCs(bcfFile, new BCF2Codec()); - assertEquals(bcfData.getHeader(), vcfData.getHeader()); - assertEquals(bcfData.getVCs(), vcfData.getVCs()); - } - - public static void assertEquals(final Iterable actual, final Iterable expected) { - final Iterator actualIT = actual.iterator(); - final Iterator expectedIT = expected.iterator(); - - while ( expectedIT.hasNext() ) { - final VariantContext expectedVC = expectedIT.next(); - if ( expectedVC == null ) - continue; - - VariantContext actualVC; - do { - Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); - actualVC = actualIT.next(); - } while ( actualIT.hasNext() && actualVC == null ); - - if ( actualVC == null ) - Assert.fail("Too few records in actual"); - - assertEquals(actualVC, expectedVC); - } - Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); - } - - /** - * Assert that two variant contexts are actually equal - * @param actual - * @param expected - */ - public static void assertEquals( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual, "VariantContext expected not null"); - Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); - Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); - Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); - Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); - - assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); - VariantBaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); - VariantBaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); - if ( expected.hasGenotypes() ) { - VariantBaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); - Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); - final Set samples = expected.getSampleNames(); - for ( final String sample : samples ) { - assertEquals(actual.getGenotype(sample), expected.getGenotype(sample)); - } - } - } - - public static void assertEquals(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); - Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); - - // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); - - // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad"); - Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); - Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); - Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); - Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - - Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); - Assert.assertEquals(actual.getPL(), expected.getPL(), "Genotype getPL"); - - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); - assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); - } - - private static void assertAttributesEquals(final Map actual, Map expected) { - final Set expectedKeys = new HashSet(expected.keySet()); - - for ( final Map.Entry act : actual.entrySet() ) { - final Object actualValue = act.getValue(); - if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { - final Object expectedValue = expected.get(act.getKey()); - if ( expectedValue instanceof List ) { - final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); - final List actualList = (List)actualValue; - Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); - for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); - } else - assertAttributeEquals(act.getKey(), actualValue, expectedValue); - } else { - // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); - } - expectedKeys.remove(act.getKey()); - } - - // now expectedKeys contains only the keys found in expected but not in actual, - // and they must all be null - for ( final String missingExpected : expectedKeys ) { - final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); - } - } - - private static final boolean isMissing(final Object value) { - if ( value == null ) return true; - else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; - else if ( value instanceof List ) { - // handles the case where all elements are null or the list is empty - for ( final Object elt : (List)value) - if ( elt != null ) - return false; - return true; - } else - return false; - } - - private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { - if ( expected instanceof Double ) { - // must be very tolerant because doubles are being rounded to 2 sig figs - VariantBaseTest.assertEqualsDoubleSmart(actual, (Double)expected, 1e-2); - } else - Assert.assertEquals(actual, expected, "Attribute " + key); - } - - public static void addComplexGenotypesTest() { - final List allAlleles = Arrays.asList( - Allele.create("A", true), - Allele.create("C", false), - Allele.create("G", false)); - - for ( int nAlleles : Arrays.asList(2, 3) ) { - for ( int highestPloidy : Arrays.asList(1, 2, 3) ) { - // site alleles - final List siteAlleles = allAlleles.subList(0, nAlleles); - - // possible alleles for genotypes - final List possibleGenotypeAlleles = new ArrayList(siteAlleles); - possibleGenotypeAlleles.add(Allele.NO_CALL); - - // there are n^ploidy possible genotypes - final List> possibleGenotypes = makeAllGenotypes(possibleGenotypeAlleles, highestPloidy); - final int nPossibleGenotypes = possibleGenotypes.size(); - - VariantContextBuilder vb = new VariantContextBuilder("unittest", "1", 1, 1, siteAlleles); - - // first test -- create n copies of each genotype - for ( int i = 0; i < nPossibleGenotypes; i++ ) { - final List samples = new ArrayList(3); - samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i))); - add(vb.genotypes(samples)); - } - - // second test -- create one sample with each genotype - { - final List samples = new ArrayList(nPossibleGenotypes); - for ( int i = 0; i < nPossibleGenotypes; i++ ) { - samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i))); - } - add(vb.genotypes(samples)); - } - - // test mixed ploidy - for ( int i = 0; i < nPossibleGenotypes; i++ ) { - for ( int ploidy = 1; ploidy < highestPloidy; ploidy++ ) { - final List samples = new ArrayList(highestPloidy); - final List genotype = possibleGenotypes.get(i).subList(0, ploidy); - samples.add(GenotypeBuilder.create("sample" + i, genotype)); - add(vb.genotypes(samples)); - } - } - } - } - } - - private static List> makeAllGenotypes(final List alleles, final int highestPloidy) { - return GeneralUtils.makePermutations(alleles, highestPloidy, true); - } - - public static void assertEquals(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); - - // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? - //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); - final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); - final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); - for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); - } - } - - public static void main( String argv[] ) { - final File variants1 = new File(argv[0]); - final File variants2 = new File(argv[1]); - try { - VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java deleted file mode 100644 index 103c8ab3b..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUnitTest.java +++ /dev/null @@ -1,918 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - - -// the imports for unit testing. - -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - - -public class VariantContextUnitTest extends VariantBaseTest { - Allele A, Aref, C, T, Tref; - Allele del, delRef, ATC, ATCref; - - // A [ref] / T at 10 - String snpLoc = "chr1"; - int snpLocStart = 10; - int snpLocStop = 10; - - // - / ATC [ref] from 20-22 - String delLoc = "chr1"; - int delLocStart = 20; - int delLocStop = 22; - - // - [ref] / ATC from 20-20 - String insLoc = "chr1"; - int insLocStart = 20; - int insLocStop = 20; - - VariantContextBuilder basicBuilder, snpBuilder, insBuilder; - - @BeforeSuite - public void before() { - del = Allele.create("A"); - delRef = Allele.create("A", true); - - A = Allele.create("A"); - C = Allele.create("C"); - Aref = Allele.create("A", true); - T = Allele.create("T"); - Tref = Allele.create("T", true); - - ATC = Allele.create("ATC"); - ATCref = Allele.create("ATC", true); - } - - @BeforeMethod - public void beforeTest() { - basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)); - snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T)); - insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC)); - } - - @Test - public void testDetermineTypes() { - Allele ACref = Allele.create("AC", true); - Allele AC = Allele.create("AC"); - Allele AT = Allele.create("AT"); - Allele C = Allele.create("C"); - Allele CAT = Allele.create("CAT"); - Allele TAref = Allele.create("TA", true); - Allele TA = Allele.create("TA"); - Allele TC = Allele.create("TC"); - Allele symbolic = Allele.create(""); - - // test REF - List alleles = Arrays.asList(Tref); - VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION); - - // test SNPs - alleles = Arrays.asList(Tref, A); - vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); - - alleles = Arrays.asList(Tref, A, C); - vc = snpBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); - - // test MNPs - alleles = Arrays.asList(ACref, TA); - vc = snpBuilder.alleles(alleles).stop(snpLocStop+1).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); - - alleles = Arrays.asList(ATCref, CAT, Allele.create("GGG")); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MNP); - - // test INDELs - alleles = Arrays.asList(Aref, ATC); - vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - - alleles = Arrays.asList(ATCref, A); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - - alleles = Arrays.asList(Tref, TA, TC); - vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - - alleles = Arrays.asList(ATCref, A, AC); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - - alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - - // test MIXED - alleles = Arrays.asList(TAref, T, TC); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); - - alleles = Arrays.asList(TAref, T, AC); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); - - alleles = Arrays.asList(ACref, ATC, AT); - vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); - - alleles = Arrays.asList(Aref, T, symbolic); - vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); - - // test SYMBOLIC - alleles = Arrays.asList(Tref, symbolic); - vc = basicBuilder.alleles(alleles).stop(snpLocStop).make(); - Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); - } - - @Test - public void testMultipleSNPAlleleOrdering() { - final List allelesNaturalOrder = Arrays.asList(Aref, C, T); - final List allelesUnnaturalOrder = Arrays.asList(Aref, T, C); - VariantContext naturalVC = snpBuilder.alleles(allelesNaturalOrder).make(); - VariantContext unnaturalVC = snpBuilder.alleles(allelesUnnaturalOrder).make(); - Assert.assertEquals(new ArrayList(naturalVC.getAlleles()), allelesNaturalOrder); - Assert.assertEquals(new ArrayList(unnaturalVC.getAlleles()), allelesUnnaturalOrder); - } - - @Test - public void testCreatingSNPVariantContext() { - - List alleles = Arrays.asList(Aref, T); - VariantContext vc = snpBuilder.alleles(alleles).make(); - - Assert.assertEquals(vc.getChr(), snpLoc); - Assert.assertEquals(vc.getStart(), snpLocStart); - Assert.assertEquals(vc.getEnd(), snpLocStop); - Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); - Assert.assertTrue(vc.isSNP()); - Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isSimpleInsertion()); - Assert.assertFalse(vc.isSimpleDeletion()); - Assert.assertFalse(vc.isMixed()); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getNAlleles(), 2); - - Assert.assertEquals(vc.getReference(), Aref); - Assert.assertEquals(vc.getAlleles().size(), 2); - Assert.assertEquals(vc.getAlternateAlleles().size(), 1); - Assert.assertEquals(vc.getAlternateAllele(0), T); - - Assert.assertFalse(vc.hasGenotypes()); - - Assert.assertEquals(vc.getSampleNames().size(), 0); - } - - @Test - public void testCreatingRefVariantContext() { - List alleles = Arrays.asList(Aref); - VariantContext vc = snpBuilder.alleles(alleles).make(); - - Assert.assertEquals(vc.getChr(), snpLoc); - Assert.assertEquals(vc.getStart(), snpLocStart); - Assert.assertEquals(vc.getEnd(), snpLocStop); - Assert.assertEquals(VariantContext.Type.NO_VARIATION, vc.getType()); - Assert.assertFalse(vc.isSNP()); - Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isSimpleInsertion()); - Assert.assertFalse(vc.isSimpleDeletion()); - Assert.assertFalse(vc.isMixed()); - Assert.assertFalse(vc.isBiallelic()); - Assert.assertEquals(vc.getNAlleles(), 1); - - Assert.assertEquals(vc.getReference(), Aref); - Assert.assertEquals(vc.getAlleles().size(), 1); - Assert.assertEquals(vc.getAlternateAlleles().size(), 0); - //Assert.assertEquals(vc.getAlternateAllele(0), T); - - Assert.assertFalse(vc.hasGenotypes()); - Assert.assertEquals(vc.getSampleNames().size(), 0); - } - - @Test - public void testCreatingDeletionVariantContext() { - List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make(); - - Assert.assertEquals(vc.getChr(), delLoc); - Assert.assertEquals(vc.getStart(), delLocStart); - Assert.assertEquals(vc.getEnd(), delLocStop); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - Assert.assertFalse(vc.isSNP()); - Assert.assertTrue(vc.isIndel()); - Assert.assertFalse(vc.isSimpleInsertion()); - Assert.assertTrue(vc.isSimpleDeletion()); - Assert.assertFalse(vc.isMixed()); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getNAlleles(), 2); - - Assert.assertEquals(vc.getReference(), ATCref); - Assert.assertEquals(vc.getAlleles().size(), 2); - Assert.assertEquals(vc.getAlternateAlleles().size(), 1); - Assert.assertEquals(vc.getAlternateAllele(0), del); - - Assert.assertFalse(vc.hasGenotypes()); - - Assert.assertEquals(vc.getSampleNames().size(), 0); - } - - @Test - public void testMatchingAlleles() { - List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make(); - VariantContext vc2 = new VariantContextBuilder("test2", delLoc, delLocStart+12, delLocStop+12, alleles).make(); - - Assert.assertTrue(vc.hasSameAllelesAs(vc2)); - Assert.assertTrue(vc.hasSameAlternateAllelesAs(vc2)); - } - - @Test - public void testCreatingInsertionVariantContext() { - List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = insBuilder.alleles(alleles).make(); - - Assert.assertEquals(vc.getChr(), insLoc); - Assert.assertEquals(vc.getStart(), insLocStart); - Assert.assertEquals(vc.getEnd(), insLocStop); - Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); - Assert.assertFalse(vc.isSNP()); - Assert.assertTrue(vc.isIndel()); - Assert.assertTrue(vc.isSimpleInsertion()); - Assert.assertFalse(vc.isSimpleDeletion()); - Assert.assertFalse(vc.isMixed()); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getNAlleles(), 2); - - Assert.assertEquals(vc.getReference(), delRef); - Assert.assertEquals(vc.getAlleles().size(), 2); - Assert.assertEquals(vc.getAlternateAlleles().size(), 1); - Assert.assertEquals(vc.getAlternateAllele(0), ATC); - Assert.assertFalse(vc.hasGenotypes()); - - Assert.assertEquals(vc.getSampleNames().size(), 0); - } - - @Test - public void testCreatingPartiallyCalledGenotype() { - List alleles = Arrays.asList(Aref, C); - Genotype g = GenotypeBuilder.create("foo", Arrays.asList(C, Allele.NO_CALL)); - VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g).make(); - - Assert.assertTrue(vc.isSNP()); - Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphicInSamples()); - Assert.assertTrue(vc.isPolymorphicInSamples()); - Assert.assertEquals(vc.getGenotype("foo"), g); - Assert.assertEquals(vc.getCalledChrCount(), 1); // we only have 1 called chromosomes, we exclude the NO_CALL one isn't called - Assert.assertEquals(vc.getCalledChrCount(Aref), 0); - Assert.assertEquals(vc.getCalledChrCount(C), 1); - Assert.assertFalse(vc.getGenotype("foo").isHet()); - Assert.assertFalse(vc.getGenotype("foo").isHom()); - Assert.assertFalse(vc.getGenotype("foo").isNoCall()); - Assert.assertFalse(vc.getGenotype("foo").isHom()); - Assert.assertTrue(vc.getGenotype("foo").isMixed()); - Assert.assertEquals(vc.getGenotype("foo").getType(), GenotypeType.MIXED); - } - - @Test (expectedExceptions = Exception.class) - public void testBadConstructorArgs1() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make(); - } - - @Test (expectedExceptions = Exception.class) - public void testBadConstructorArgs2() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make(); - } - - @Test (expectedExceptions = Exception.class) - public void testBadConstructorArgs3() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make(); - } - - @Test (expectedExceptions = Throwable.class) - public void testBadConstructorArgs4() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.emptyList()).make(); - } - - @Test (expectedExceptions = Exception.class) - public void testBadConstructorArgsDuplicateAlleles1() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make(); - } - - @Test (expectedExceptions = Exception.class) - public void testBadConstructorArgsDuplicateAlleles2() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make(); - } - - @Test (expectedExceptions = Throwable.class) - public void testBadLoc1() { - List alleles = Arrays.asList(Aref, T, del); - new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make(); - } - - @Test (expectedExceptions = Throwable.class) - public void testBadID1() { - new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(null).make(); - } - - @Test (expectedExceptions = Exception.class) - public void testBadID2() { - new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make(); - } - - @Test (expectedExceptions = Throwable.class) - public void testBadPError() { - new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).log10PError(0.5).make(); - } - - @Test - public void testAccessingSimpleSNPGenotypes() { - List alleles = Arrays.asList(Aref, T); - - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - - VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) - .genotypes(g1, g2, g3).make(); - - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphicInSamples()); - Assert.assertTrue(vc.isPolymorphicInSamples()); - Assert.assertEquals(vc.getSampleNames().size(), 3); - - Assert.assertEquals(vc.getGenotypes().size(), 3); - Assert.assertEquals(vc.getGenotypes().get("AA"), g1); - Assert.assertEquals(vc.getGenotype("AA"), g1); - Assert.assertEquals(vc.getGenotypes().get("AT"), g2); - Assert.assertEquals(vc.getGenotype("AT"), g2); - Assert.assertEquals(vc.getGenotypes().get("TT"), g3); - Assert.assertEquals(vc.getGenotype("TT"), g3); - - Assert.assertTrue(vc.hasGenotype("AA")); - Assert.assertTrue(vc.hasGenotype("AT")); - Assert.assertTrue(vc.hasGenotype("TT")); - Assert.assertFalse(vc.hasGenotype("foo")); - Assert.assertFalse(vc.hasGenotype("TTT")); - Assert.assertFalse(vc.hasGenotype("at")); - Assert.assertFalse(vc.hasGenotype("tt")); - - Assert.assertEquals(vc.getCalledChrCount(), 6); - Assert.assertEquals(vc.getCalledChrCount(Aref), 3); - Assert.assertEquals(vc.getCalledChrCount(T), 3); - } - - @Test - public void testAccessingCompleteGenotypes() { - List alleles = Arrays.asList(Aref, T, ATC); - - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - Genotype g4 = GenotypeBuilder.create("Td", Arrays.asList(T, ATC)); - Genotype g5 = GenotypeBuilder.create("dd", Arrays.asList(ATC, ATC)); - Genotype g6 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - - VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) - .genotypes(g1, g2, g3, g4, g5, g6).make(); - - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertFalse(vc.isMonomorphicInSamples()); - Assert.assertTrue(vc.isPolymorphicInSamples()); - Assert.assertEquals(vc.getGenotypes().size(), 6); - - Assert.assertEquals(3, vc.getGenotypes(Arrays.asList("AA", "Td", "dd")).size()); - - Assert.assertEquals(10, vc.getCalledChrCount()); - Assert.assertEquals(3, vc.getCalledChrCount(Aref)); - Assert.assertEquals(4, vc.getCalledChrCount(T)); - Assert.assertEquals(3, vc.getCalledChrCount(ATC)); - Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); - } - - @Test - public void testAccessingRefGenotypes() { - List alleles1 = Arrays.asList(Aref, T); - List alleles2 = Arrays.asList(Aref); - List alleles3 = Arrays.asList(Aref, T); - for ( List alleles : Arrays.asList(alleles1, alleles2, alleles3)) { - Genotype g1 = GenotypeBuilder.create("AA1", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AA2", Arrays.asList(Aref, Aref)); - Genotype g3 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles) - .genotypes(g1, g2, g3).make(); - - Assert.assertTrue(vc.hasGenotypes()); - Assert.assertTrue(vc.isMonomorphicInSamples()); - Assert.assertFalse(vc.isPolymorphicInSamples()); - Assert.assertEquals(vc.getGenotypes().size(), 3); - - Assert.assertEquals(4, vc.getCalledChrCount()); - Assert.assertEquals(4, vc.getCalledChrCount(Aref)); - Assert.assertEquals(0, vc.getCalledChrCount(T)); - Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL)); - } - } - - @Test - public void testFilters() { - List alleles = Arrays.asList(Aref, T); - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - - VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1, g2).make(); - - Assert.assertTrue(vc.isNotFiltered()); - Assert.assertFalse(vc.isFiltered()); - Assert.assertEquals(0, vc.getFilters().size()); - Assert.assertFalse(vc.filtersWereApplied()); - Assert.assertNull(vc.getFiltersMaybeNull()); - - vc = new VariantContextBuilder(vc).filters("BAD_SNP_BAD!").make(); - - Assert.assertFalse(vc.isNotFiltered()); - Assert.assertTrue(vc.isFiltered()); - Assert.assertEquals(1, vc.getFilters().size()); - Assert.assertTrue(vc.filtersWereApplied()); - Assert.assertNotNull(vc.getFiltersMaybeNull()); - - Set filters = new HashSet(Arrays.asList("BAD_SNP_BAD!", "REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE")); - vc = new VariantContextBuilder(vc).filters(filters).make(); - - Assert.assertFalse(vc.isNotFiltered()); - Assert.assertTrue(vc.isFiltered()); - Assert.assertEquals(3, vc.getFilters().size()); - Assert.assertTrue(vc.filtersWereApplied()); - Assert.assertNotNull(vc.getFiltersMaybeNull()); - } - - @Test - public void testGetGenotypeCounts() { - List alleles = Arrays.asList(Aref, T); - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - Genotype g4 = GenotypeBuilder.create("A.", Arrays.asList(Aref, Allele.NO_CALL)); - Genotype g5 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - - // we need to create a new VariantContext each time - VariantContext vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - Assert.assertEquals(1, vc.getHetCount()); - vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - Assert.assertEquals(1, vc.getHomRefCount()); - vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - Assert.assertEquals(1, vc.getHomVarCount()); - vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - Assert.assertEquals(1, vc.getMixedCount()); - vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); - Assert.assertEquals(1, vc.getNoCallCount()); - } - - @Test - public void testVCFfromGenotypes() { - List alleles = Arrays.asList(Aref, T); - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - Genotype g4 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); - VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4).make(); - - VariantContext vc12 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g2.getSampleName())), true); - VariantContext vc1 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName())), true); - VariantContext vc23 = vc.subContextFromSamples(new HashSet(Arrays.asList(g2.getSampleName(), g3.getSampleName())), true); - VariantContext vc4 = vc.subContextFromSamples(new HashSet(Arrays.asList(g4.getSampleName())), true); - VariantContext vc14 = vc.subContextFromSamples(new HashSet(Arrays.asList(g1.getSampleName(), g4.getSampleName())), true); - - Assert.assertTrue(vc12.isPolymorphicInSamples()); - Assert.assertTrue(vc23.isPolymorphicInSamples()); - Assert.assertTrue(vc1.isMonomorphicInSamples()); - Assert.assertTrue(vc4.isMonomorphicInSamples()); - Assert.assertTrue(vc14.isMonomorphicInSamples()); - - Assert.assertTrue(vc12.isSNP()); - Assert.assertTrue(vc12.isVariant()); - Assert.assertTrue(vc12.isBiallelic()); - - Assert.assertFalse(vc1.isSNP()); - Assert.assertFalse(vc1.isVariant()); - Assert.assertFalse(vc1.isBiallelic()); - - Assert.assertTrue(vc23.isSNP()); - Assert.assertTrue(vc23.isVariant()); - Assert.assertTrue(vc23.isBiallelic()); - - Assert.assertFalse(vc4.isSNP()); - Assert.assertFalse(vc4.isVariant()); - Assert.assertFalse(vc4.isBiallelic()); - - Assert.assertFalse(vc14.isSNP()); - Assert.assertFalse(vc14.isVariant()); - Assert.assertFalse(vc14.isBiallelic()); - - Assert.assertEquals(3, vc12.getCalledChrCount(Aref)); - Assert.assertEquals(1, vc23.getCalledChrCount(Aref)); - Assert.assertEquals(2, vc1.getCalledChrCount(Aref)); - Assert.assertEquals(0, vc4.getCalledChrCount(Aref)); - Assert.assertEquals(2, vc14.getCalledChrCount(Aref)); - } - - public void testGetGenotypeMethods() { - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - GenotypesContext gc = GenotypesContext.create(g1, g2, g3); - VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); - - Assert.assertEquals(vc.getGenotype("AA"), g1); - Assert.assertEquals(vc.getGenotype("AT"), g2); - Assert.assertEquals(vc.getGenotype("TT"), g3); - Assert.assertEquals(vc.getGenotype("CC"), null); - - Assert.assertEquals(vc.getGenotypes(), gc); - Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT")), Arrays.asList(g1, g2)); - Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "TT")), Arrays.asList(g1, g3)); - Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "TT")), Arrays.asList(g1, g2, g3)); - Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "CC")), Arrays.asList(g1, g2)); - - Assert.assertEquals(vc.getGenotype(0), g1); - Assert.assertEquals(vc.getGenotype(1), g2); - Assert.assertEquals(vc.getGenotype(2), g3); - } - - // -------------------------------------------------------------------------------- - // - // Test allele merging - // - // -------------------------------------------------------------------------------- - - private class GetAllelesTest extends TestDataProvider { - List alleles; - - private GetAllelesTest(String name, Allele... arg) { - super(GetAllelesTest.class, name); - this.alleles = Arrays.asList(arg); - } - - public String toString() { - return String.format("%s input=%s", super.toString(), alleles); - } - } - - @DataProvider(name = "getAlleles") - public Object[][] mergeAllelesData() { - new GetAllelesTest("A*", Aref); - new GetAllelesTest("A*/C", Aref, C); - new GetAllelesTest("A*/C/T", Aref, C, T); - new GetAllelesTest("A*/T/C", Aref, T, C); - new GetAllelesTest("A*/C/T/ATC", Aref, C, T, ATC); - new GetAllelesTest("A*/T/C/ATC", Aref, T, C, ATC); - new GetAllelesTest("A*/ATC/T/C", Aref, ATC, T, C); - - return GetAllelesTest.getTests(GetAllelesTest.class); - } - - @Test(dataProvider = "getAlleles") - public void testMergeAlleles(GetAllelesTest cfg) { - final List altAlleles = cfg.alleles.subList(1, cfg.alleles.size()); - final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).make(); - - Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles"); - Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size"); - Assert.assertEquals(vc.getAlternateAlleles(), altAlleles, "VC alt alleles not the same as input alt alleles"); - - - for ( int i = 0; i < cfg.alleles.size(); i++ ) { - final Allele inputAllele = cfg.alleles.get(i); - - Assert.assertTrue(vc.hasAllele(inputAllele)); - if ( inputAllele.isReference() ) { - final Allele nonRefVersion = Allele.create(inputAllele.getBases(), false); - Assert.assertTrue(vc.hasAllele(nonRefVersion, true)); - Assert.assertFalse(vc.hasAllele(nonRefVersion, false)); - } - - Assert.assertEquals(inputAllele, vc.getAllele(inputAllele.getBaseString())); - Assert.assertEquals(inputAllele, vc.getAllele(inputAllele.getBases())); - - if ( i > 0 ) { // it's an alt allele - Assert.assertEquals(inputAllele, vc.getAlternateAllele(i-1)); - } - } - - final Allele missingAllele = Allele.create("AACCGGTT"); // does not exist - Assert.assertNull(vc.getAllele(missingAllele.getBases())); - Assert.assertFalse(vc.hasAllele(missingAllele)); - Assert.assertFalse(vc.hasAllele(missingAllele, true)); - } - - private class SitesAndGenotypesVC extends TestDataProvider { - VariantContext vc, copy; - - private SitesAndGenotypesVC(String name, VariantContext original) { - super(SitesAndGenotypesVC.class, name); - this.vc = original; - this.copy = new VariantContextBuilder(original).make(); - } - - public String toString() { - return String.format("%s input=%s", super.toString(), vc); - } - } - - @DataProvider(name = "SitesAndGenotypesVC") - public Object[][] MakeSitesAndGenotypesVCs() { - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - - VariantContext sites = new VariantContextBuilder("sites", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).make(); - VariantContext genotypes = new VariantContextBuilder(sites).source("genotypes").genotypes(g1, g2, g3).make(); - - new SitesAndGenotypesVC("sites", sites); - new SitesAndGenotypesVC("genotypes", genotypes); - - return SitesAndGenotypesVC.getTests(SitesAndGenotypesVC.class); - } - - // -------------------------------------------------------------------------------- - // - // Test modifying routines - // - // -------------------------------------------------------------------------------- - @Test(dataProvider = "SitesAndGenotypesVC") - public void runModifyVCTests(SitesAndGenotypesVC cfg) { - VariantContext modified = new VariantContextBuilder(cfg.vc).loc("chr2", 123, 123).make(); - Assert.assertEquals(modified.getChr(), "chr2"); - Assert.assertEquals(modified.getStart(), 123); - Assert.assertEquals(modified.getEnd(), 123); - - modified = new VariantContextBuilder(cfg.vc).id("newID").make(); - Assert.assertEquals(modified.getID(), "newID"); - - Set newFilters = Collections.singleton("newFilter"); - modified = new VariantContextBuilder(cfg.vc).filters(newFilters).make(); - Assert.assertEquals(modified.getFilters(), newFilters); - - // test the behavior when the builder's attribute object is null - modified = new VariantContextBuilder(modified).attributes(null).make(); - Assert.assertTrue(modified.getAttributes().isEmpty()); - modified = new VariantContextBuilder(modified).attributes(null).rmAttribute("AC").make(); - Assert.assertTrue(modified.getAttributes().isEmpty()); - modified = new VariantContextBuilder(modified).attributes(null).attribute("AC", 1).make(); - Assert.assertEquals(modified.getAttribute("AC"), 1); - - // test the behavior when the builder's attribute object is not initialized - modified = new VariantContextBuilder(modified.getSource(), modified.getChr(), modified.getStart(), modified.getEnd(), modified.getAlleles()).attribute("AC", 1).make(); - - // test normal attribute modification - modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make(); - Assert.assertEquals(modified.getAttribute("AC"), 1); - modified = new VariantContextBuilder(modified).attribute("AC", 2).make(); - Assert.assertEquals(modified.getAttribute("AC"), 2); - - Genotype g1 = GenotypeBuilder.create("AA2", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT2", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT2", Arrays.asList(T, T)); - GenotypesContext gc = GenotypesContext.create(g1,g2,g3); - modified = new VariantContextBuilder(cfg.vc).genotypes(gc).make(); - Assert.assertEquals(modified.getGenotypes(), gc); - modified = new VariantContextBuilder(cfg.vc).noGenotypes().make(); - Assert.assertTrue(modified.getGenotypes().isEmpty()); - - // test that original hasn't changed - Assert.assertEquals(cfg.vc.getChr(), cfg.copy.getChr()); - Assert.assertEquals(cfg.vc.getStart(), cfg.copy.getStart()); - Assert.assertEquals(cfg.vc.getEnd(), cfg.copy.getEnd()); - Assert.assertEquals(cfg.vc.getAlleles(), cfg.copy.getAlleles()); - Assert.assertEquals(cfg.vc.getAttributes(), cfg.copy.getAttributes()); - Assert.assertEquals(cfg.vc.getID(), cfg.copy.getID()); - Assert.assertEquals(cfg.vc.getGenotypes(), cfg.copy.getGenotypes()); - Assert.assertEquals(cfg.vc.getLog10PError(), cfg.copy.getLog10PError()); - Assert.assertEquals(cfg.vc.getFilters(), cfg.copy.getFilters()); - } - - // -------------------------------------------------------------------------------- - // - // Test subcontext - // - // -------------------------------------------------------------------------------- - private class SubContextTest extends TestDataProvider { - Set samples; - boolean updateAlleles; - - private SubContextTest(Collection samples, boolean updateAlleles) { - super(SubContextTest.class); - this.samples = new HashSet(samples); - this.updateAlleles = updateAlleles; - } - - public String toString() { - return String.format("%s samples=%s updateAlleles=%b", super.toString(), samples, updateAlleles); - } - } - - @DataProvider(name = "SubContextTest") - public Object[][] MakeSubContextTest() { - for ( boolean updateAlleles : Arrays.asList(true, false)) { - new SubContextTest(Collections.emptySet(), updateAlleles); - new SubContextTest(Collections.singleton("MISSING"), updateAlleles); - new SubContextTest(Collections.singleton("AA"), updateAlleles); - new SubContextTest(Collections.singleton("AT"), updateAlleles); - new SubContextTest(Collections.singleton("TT"), updateAlleles); - new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles); - new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles); - new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles); - new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles); - } - - return SubContextTest.getTests(SubContextTest.class); - } - - @Test(dataProvider = "SubContextTest") - public void runSubContextTest(SubContextTest cfg) { - Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref)); - Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T)); - Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T)); - - GenotypesContext gc = GenotypesContext.create(g1, g2, g3); - VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); - VariantContext sub = vc.subContextFromSamples(cfg.samples, cfg.updateAlleles); - - // unchanged attributes should be the same - Assert.assertEquals(sub.getChr(), vc.getChr()); - Assert.assertEquals(sub.getStart(), vc.getStart()); - Assert.assertEquals(sub.getEnd(), vc.getEnd()); - Assert.assertEquals(sub.getLog10PError(), vc.getLog10PError()); - Assert.assertEquals(sub.getFilters(), vc.getFilters()); - Assert.assertEquals(sub.getID(), vc.getID()); - Assert.assertEquals(sub.getAttributes(), vc.getAttributes()); - - Set expectedGenotypes = new HashSet(); - if ( cfg.samples.contains(g1.getSampleName()) ) expectedGenotypes.add(g1); - if ( cfg.samples.contains(g2.getSampleName()) ) expectedGenotypes.add(g2); - if ( cfg.samples.contains(g3.getSampleName()) ) expectedGenotypes.add(g3); - GenotypesContext expectedGC = GenotypesContext.copy(expectedGenotypes); - - // these values depend on the results of sub - if ( cfg.updateAlleles ) { - // do the work to see what alleles should be here, and which not - Set alleles = new HashSet(); - for ( final Genotype g : expectedGC ) alleles.addAll(g.getAlleles()); - if ( ! alleles.contains(Aref) ) alleles.add(Aref); // always have the reference - Assert.assertEquals(new HashSet(sub.getAlleles()), alleles); - } else { - // not updating alleles -- should be the same - Assert.assertEquals(sub.getAlleles(), vc.getAlleles()); - } - - // same sample names => success - Assert.assertEquals(sub.getGenotypes().getSampleNames(), expectedGC.getSampleNames()); - } - - // -------------------------------------------------------------------------------- - // - // Test sample name functions - // - // -------------------------------------------------------------------------------- - private class SampleNamesTest extends TestDataProvider { - List sampleNames; - List sampleNamesInOrder; - - private SampleNamesTest(List sampleNames, List sampleNamesInOrder) { - super(SampleNamesTest.class); - this.sampleNamesInOrder = sampleNamesInOrder; - this.sampleNames = sampleNames; - } - - public String toString() { - return String.format("%s samples=%s order=%s", super.toString(), sampleNames, sampleNamesInOrder); - } - } - - @DataProvider(name = "SampleNamesTest") - public Object[][] MakeSampleNamesTest() { - new SampleNamesTest(Arrays.asList("1"), Arrays.asList("1")); - new SampleNamesTest(Arrays.asList("2", "1"), Arrays.asList("1", "2")); - new SampleNamesTest(Arrays.asList("1", "2"), Arrays.asList("1", "2")); - new SampleNamesTest(Arrays.asList("1", "2", "3"), Arrays.asList("1", "2", "3")); - new SampleNamesTest(Arrays.asList("2", "1", "3"), Arrays.asList("1", "2", "3")); - new SampleNamesTest(Arrays.asList("2", "3", "1"), Arrays.asList("1", "2", "3")); - new SampleNamesTest(Arrays.asList("3", "1", "2"), Arrays.asList("1", "2", "3")); - new SampleNamesTest(Arrays.asList("3", "2", "1"), Arrays.asList("1", "2", "3")); - new SampleNamesTest(Arrays.asList("NA2", "NA1"), Arrays.asList("NA1", "NA2")); - return SampleNamesTest.getTests(SampleNamesTest.class); - } - - private final static void assertGenotypesAreInOrder(Iterable gIt, List names) { - int i = 0; - for ( final Genotype g : gIt ) { - Assert.assertEquals(g.getSampleName(), names.get(i), "Unexpected genotype ordering"); - i++; - } - } - - - @Test(dataProvider = "SampleNamesTest") - public void runSampleNamesTest(SampleNamesTest cfg) { - GenotypesContext gc = GenotypesContext.create(cfg.sampleNames.size()); - for ( final String name : cfg.sampleNames ) { - gc.add(GenotypeBuilder.create(name, Arrays.asList(Aref, T))); - } - - VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); - - // same sample names => success - Assert.assertEquals(vc.getSampleNames(), new HashSet(cfg.sampleNames), "vc.getSampleNames() = " + vc.getSampleNames()); - Assert.assertEquals(vc.getSampleNamesOrderedByName(), cfg.sampleNamesInOrder, "vc.getSampleNamesOrderedByName() = " + vc.getSampleNamesOrderedByName()); - - assertGenotypesAreInOrder(vc.getGenotypesOrderedByName(), cfg.sampleNamesInOrder); - assertGenotypesAreInOrder(vc.getGenotypesOrderedBy(cfg.sampleNames), cfg.sampleNames); - } - - @Test - public void testGenotypeCounting() { - Genotype noCall = GenotypeBuilder.create("nocall", Arrays.asList(Allele.NO_CALL)); - Genotype mixed = GenotypeBuilder.create("mixed", Arrays.asList(Aref, Allele.NO_CALL)); - Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(Aref, Aref)); - Genotype het = GenotypeBuilder.create("het", Arrays.asList(Aref, T)); - Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(T, T)); - - List allGenotypes = Arrays.asList(noCall, mixed, homRef, het, homVar); - final int nCycles = allGenotypes.size() * 10; - - for ( int i = 0; i < nCycles; i++ ) { - int nNoCall = 0, nNoCallAlleles = 0, nA = 0, nT = 0, nMixed = 0, nHomRef = 0, nHet = 0, nHomVar = 0; - int nSamples = 0; - GenotypesContext gc = GenotypesContext.create(); - for ( int j = 0; j < i; j++ ) { - nSamples++; - Genotype g = allGenotypes.get(j % allGenotypes.size()); - final String name = String.format("%s_%d%d", g.getSampleName(), i, j); - gc.add(GenotypeBuilder.create(name, g.getAlleles())); - switch ( g.getType() ) { - case NO_CALL: nNoCall++; nNoCallAlleles++; break; - case HOM_REF: nA += 2; nHomRef++; break; - case HET: nA++; nT++; nHet++; break; - case HOM_VAR: nT += 2; nHomVar++; break; - case MIXED: nA++; nNoCallAlleles++; nMixed++; break; - default: throw new RuntimeException("Unexpected genotype type " + g.getType()); - } - - } - - VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make(); - Assert.assertEquals(vc.getNSamples(), nSamples); - if ( nSamples > 0 ) { - Assert.assertEquals(vc.isPolymorphicInSamples(), nT > 0); - Assert.assertEquals(vc.isMonomorphicInSamples(), nT == 0); - } - Assert.assertEquals(vc.getCalledChrCount(), nA + nT); - - Assert.assertEquals(vc.getCalledChrCount(Allele.NO_CALL), nNoCallAlleles); - Assert.assertEquals(vc.getCalledChrCount(Aref), nA); - Assert.assertEquals(vc.getCalledChrCount(T), nT); - - Assert.assertEquals(vc.getNoCallCount(), nNoCall); - Assert.assertEquals(vc.getHomRefCount(), nHomRef); - Assert.assertEquals(vc.getHetCount(), nHet); - Assert.assertEquals(vc.getHomVarCount(), nHomVar); - Assert.assertEquals(vc.getMixedCount(), nMixed); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantJEXLContextUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantJEXLContextUnitTest.java deleted file mode 100644 index 8d2569771..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantJEXLContextUnitTest.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext; - -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; - - -/** - * - * @author aaron - * - * Class VariantJEXLContextUnitTest - * - * Test out parts of the VariantJEXLContext - */ -public class VariantJEXLContextUnitTest extends VariantBaseTest { - - private static String expression = "QUAL > 500.0"; - private static VariantContextUtils.JexlVCMatchExp exp; - - Allele A, Aref, T, Tref; - - Allele ATC, ATCref; - // A [ref] / T at 10 - - // - / ATC [ref] from 20-23 - - @BeforeClass - public void beforeClass() { - try { - exp = new VariantContextUtils.JexlVCMatchExp("name", VariantContextUtils.engine.createExpression(expression)); - } catch (Exception e) { - Assert.fail("Unable to create expression" + e.getMessage()); - } - } - - @BeforeMethod - public void before() { - A = Allele.create("A"); - Aref = Allele.create("A", true); - T = Allele.create("T"); - Tref = Allele.create("T", true); - - ATC = Allele.create("ATC"); - ATCref = Allele.create("ATC", true); - } - - - @Test - public void testGetValue() { - Map map = getVarContext(); - - // make sure the context has a value - Assert.assertTrue(!map.isEmpty()); - Assert.assertEquals(map.size(), 1); - - // eval our known expression - Assert.assertTrue(!map.get(exp)); - } - - @Test(expectedExceptions=UnsupportedOperationException.class) - public void testContainsValue() { - Map map = getVarContext(); - - map.containsValue(exp); - } - - @Test(expectedExceptions=UnsupportedOperationException.class) - public void testRemove() { - Map map = getVarContext(); - - map.remove(exp); - } - - @Test(expectedExceptions=UnsupportedOperationException.class) - public void testEntrySet() { - Map map = getVarContext(); - - map.entrySet(); - } - - @Test(expectedExceptions=UnsupportedOperationException.class) - public void testClear() { - Map map = getVarContext(); - - map.clear(); - } - - /** - * helper method - * @return a VariantJEXLContext - */ - private JEXLMap getVarContext() { - List alleles = Arrays.asList(Aref, T); - - VariantContext vc = new VariantContextBuilder("test", "chr1", 10, 10, alleles).make(); - return new JEXLMap(Arrays.asList(exp),vc); - } -} diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/writer/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/writer/VCFWriterUnitTest.java deleted file mode 100644 index bbfac11cb..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/writer/VCFWriterUnitTest.java +++ /dev/null @@ -1,200 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.FeatureReader; -import org.broad.tribble.Tribble; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderVersion; -import org.broadinstitute.variant.variantcontext.*; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - - -/** - * @author aaron - *

- * Class VCFWriterUnitTest - *

- * This class tests out the ability of the VCF writer to correctly write VCF files - */ -public class VCFWriterUnitTest extends VariantBaseTest { - private Set metaData = new HashSet(); - private Set additionalColumns = new HashSet(); - private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); - private IndexedFastaSequenceFile seq; - - @BeforeClass - public void beforeTests() { - File referenceFile = new File(hg19Reference); - try { - seq = new IndexedFastaSequenceFile(referenceFile); - } - catch(FileNotFoundException ex) { - throw new RuntimeException(referenceFile.getAbsolutePath(), ex); - } - } - - /** test, using the writer and reader, that we can output and input a VCF file without problems */ - @Test - public void testBasicWriteAndRead() { - VCFHeader header = createFakeHeader(metaData,additionalColumns); - final EnumSet options = EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER); - VariantContextWriter writer = VariantContextWriterFactory.create(fakeVCFFile, seq.getSequenceDictionary(), options); - writer.writeHeader(header); - writer.add(createVC(header)); - writer.add(createVC(header)); - writer.close(); - VCFCodec codec = new VCFCodec(); - VCFHeader headerFromFile = null; - FeatureReader reader = AbstractFeatureReader.getFeatureReader(fakeVCFFile.getAbsolutePath(), codec, false); - headerFromFile = (VCFHeader)reader.getHeader(); - - int counter = 0; - - // validate what we're reading in - validateHeader(headerFromFile); - - try { - Iterator it = reader.iterator(); - while(it.hasNext()) { - VariantContext vc = it.next(); - counter++; - } - Assert.assertEquals(counter, 2); - Tribble.indexFile(fakeVCFFile).delete(); - fakeVCFFile.delete(); - } - catch (IOException e ) { - throw new RuntimeException(e.getMessage()); - } - - } - - /** - * create a fake header of known quantity - * @param metaData the header lines - * @param additionalColumns the additional column names - * @return a fake VCF header - */ - public static VCFHeader createFakeHeader(Set metaData, Set additionalColumns) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); - metaData.add(new VCFHeaderLine("two", "2")); - additionalColumns.add("extra1"); - additionalColumns.add("extra2"); - return new VCFHeader(metaData, additionalColumns); - } - - /** - * create a fake VCF record - * @param header the VCF header - * @return a VCFRecord - */ - private VariantContext createVC(VCFHeader header) { - List alleles = new ArrayList(); - Set filters = null; - Map attributes = new HashMap(); - GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size()); - - alleles.add(Allele.create("A",true)); - alleles.add(Allele.create("ACC",false)); - - attributes.put("DP","50"); - for (String name : header.getGenotypeSamples()) { - Genotype gt = new GenotypeBuilder(name,alleles.subList(1,2)).GQ(0).attribute("BB", "1").phased(true).make(); - genotypes.add(gt); - } - return new VariantContextBuilder("RANDOM", "chr1", 1, 1, alleles) - .genotypes(genotypes).attributes(attributes).make(); - } - - - /** - * validate a VCF header - * @param header the header to validate - */ - public void validateHeader(VCFHeader header) { - // check the fields - int index = 0; - for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { - Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field); - index++; - } - Assert.assertEquals(header.getMetaDataInSortedOrder().size(), metaData.size()); - index = 0; - for (String key : header.getGenotypeSamples()) { - Assert.assertTrue(additionalColumns.contains(key)); - index++; - } - Assert.assertEquals(index, additionalColumns.size()); - } - - @DataProvider(name = "VCFWriterDoubleFormatTestData") - public Object[][] makeVCFWriterDoubleFormatTestData() { - List tests = new ArrayList(); - tests.add(new Object[]{1.0, "1.00"}); - tests.add(new Object[]{10.1, "10.10"}); - tests.add(new Object[]{10.01, "10.01"}); - tests.add(new Object[]{10.012, "10.01"}); - tests.add(new Object[]{10.015, "10.02"}); - tests.add(new Object[]{0.0, "0.00"}); - tests.add(new Object[]{0.5, "0.500"}); - tests.add(new Object[]{0.55, "0.550"}); - tests.add(new Object[]{0.555, "0.555"}); - tests.add(new Object[]{0.5555, "0.556"}); - tests.add(new Object[]{0.1, "0.100"}); - tests.add(new Object[]{0.050, "0.050"}); - tests.add(new Object[]{0.010, "0.010"}); - tests.add(new Object[]{0.012, "0.012"}); - tests.add(new Object[]{0.0012, "1.200e-03"}); - tests.add(new Object[]{1.2e-4, "1.200e-04"}); - tests.add(new Object[]{1.21e-4, "1.210e-04"}); - tests.add(new Object[]{1.212e-5, "1.212e-05"}); - tests.add(new Object[]{1.2123e-6, "1.212e-06"}); - tests.add(new Object[]{Double.POSITIVE_INFINITY, "Infinity"}); - tests.add(new Object[]{Double.NEGATIVE_INFINITY, "-Infinity"}); - tests.add(new Object[]{Double.NaN, "NaN"}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "VCFWriterDoubleFormatTestData") - public void testVCFWriterDoubleFormatTestData(final double d, final String expected) { - Assert.assertEquals(VCFWriter.formatVCFDouble(d), expected, "Failed to pretty print double in VCFWriter"); - } -} - diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/writer/VariantContextWritersUnitTest.java deleted file mode 100644 index 9e6541bfa..000000000 --- a/public/java/test/org/broadinstitute/variant/variantcontext/writer/VariantContextWritersUnitTest.java +++ /dev/null @@ -1,146 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.variantcontext.writer; - - -// the imports for unit testing. - - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.FeatureCodec; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.bcf2.BCF2Codec; -import org.broadinstitute.variant.vcf.VCFCodec; -import org.broadinstitute.variant.vcf.VCFHeader; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.EnumSet; -import java.util.List; - - -public class VariantContextWritersUnitTest extends VariantBaseTest { - private SAMSequenceDictionary dictionary; - - @BeforeSuite - public void before() throws IOException { - final File source = new File(b37KGReference); - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(source); - dictionary = seq.getSequenceDictionary(); - VariantContextTestProvider.initializeTests(); - } - - @DataProvider(name = "VariantContextTest_SingleContexts") - public Object[][] SiteVCsTest() { - List tests = new ArrayList(); - for ( VariantContextTestProvider.VariantContextTestData testData : VariantContextTestProvider.generateSiteTests() ) - tests.add(new Object[]{testData}); - return tests.toArray(new Object[][]{}); - } - - // -------------------------------------------------------------------------------- - // - // Test BCF2 reader / writer - // - // -------------------------------------------------------------------------------- - - @Test(dataProvider = "VariantContextTest_SingleContexts") - public void testBCF2WriterReader(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { - VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData); - } - - @Test(dataProvider = "VariantContextTest_SingleContexts") - public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { - VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData); - } - - private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest { - @Override - public String getExtension() { - return ".bcf"; - } - - @Override - public FeatureCodec makeCodec() { - return new BCF2Codec(); - } - - @Override - public VariantContextWriter makeWriter(final File file, final EnumSet baseOptions) { - return VariantContextWriterFactory.create(file, dictionary, baseOptions); - } - } - - // -------------------------------------------------------------------------------- - // - // Test VCF reader / writer - // - // -------------------------------------------------------------------------------- - - @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") - public void testVCF4WriterReader(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { - VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData); - } - - @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") - public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { - VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData); - } - - private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest { - @Override - public String getExtension() { - return ".vcf"; - } - - @Override - public List postprocess(final VCFHeader header, final List vcsAfterIO) { - final List fullyDecoded = new ArrayList(vcsAfterIO.size()); - - for ( final VariantContext withStrings : vcsAfterIO ) - fullyDecoded.add(withStrings.fullyDecode(header, false)); - - return fullyDecoded; - } - - @Override - public FeatureCodec makeCodec() { - return new VCFCodec(); - } - - @Override - public VariantContextWriter makeWriter(final File file, final EnumSet baseOptions) { - return VariantContextWriterFactory.create(file, dictionary, baseOptions); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/variant/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/variant/vcf/IndexFactoryUnitTest.java deleted file mode 100644 index 6292baae3..000000000 --- a/public/java/test/org/broadinstitute/variant/vcf/IndexFactoryUnitTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.broad.tribble.AbstractFeatureReader; -import org.broad.tribble.CloseableTribbleIterator; -import org.broad.tribble.Tribble; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.writer.Options; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; -import org.testng.annotations.BeforeTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Arrays; -import java.util.EnumSet; - -/** - * tests out the various functions in the index factory class - */ -public class IndexFactoryUnitTest extends VariantBaseTest { - - File inputFile = new File(variantTestDataRoot + "HiSeq.10000.vcf"); - File outputFile = new File(variantTestDataRoot + "onTheFlyOutputTest.vcf"); - File outputFileIndex = Tribble.indexFile(outputFile); - - private SAMSequenceDictionary dict; - - @BeforeTest - public void setup() { - try { - dict = new IndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary(); - } - catch(FileNotFoundException ex) { - throw new RuntimeException(b37KGReference,ex); - } - } - - // - // test out scoring the indexes - // - @Test - public void testOnTheFlyIndexing1() throws IOException { - Index indexFromInputFile = IndexFactory.createDynamicIndex(inputFile, new VCFCodec()); - if ( outputFileIndex.exists() ) { - System.err.println("Deleting " + outputFileIndex); - outputFileIndex.delete(); - } - - for ( int maxRecords : Arrays.asList(0, 1, 10, 100, 1000, -1)) { - AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), new VCFCodec(), indexFromInputFile); - - int counter = 0; - final EnumSet options = EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER); - VariantContextWriter writer = VariantContextWriterFactory.create(outputFile, dict, options); - writer.writeHeader((VCFHeader)source.getHeader()); - CloseableTribbleIterator it = source.iterator(); - while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { - VariantContext vc = it.next(); - writer.add(vc); - } - writer.close(); - - // test that the input index is the same as the one created from the identical input file - // test that the dynamic index is the same as the output index, which is equal to the input index - //WalkerTest.assertOnDiskIndexEqualToNewlyCreatedIndex(outputFileIndex, "unittest", outputFile); - } - } -} diff --git a/public/java/test/org/broadinstitute/variant/vcf/VCFHeaderUnitTest.java b/public/java/test/org/broadinstitute/variant/vcf/VCFHeaderUnitTest.java deleted file mode 100644 index 7d6b11953..000000000 --- a/public/java/test/org/broadinstitute/variant/vcf/VCFHeaderUnitTest.java +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.Assert; - -import org.testng.annotations.Test; - -import java.io.*; -import java.math.BigInteger; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; - -/** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 30, 2010 - * Time: 3:32:08 PM - * To change this template use File | Settings | File Templates. - */ -public class VCFHeaderUnitTest extends VariantBaseTest { - - private VCFHeader createHeader(String headerStr) { - VCFCodec codec = new VCFCodec(); - VCFHeader header = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(headerStr)))); - Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount); - return header; - } - - @Test - public void testVCF4ToVCF4() { - VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "f05a57053a0c6a5bac15dba566f7f7ff"); - } - - @Test - public void testVCF4ToVCF4_alternate() { - VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "b1d71cc94261053131f8d239d65a8c9f"); - } - - /** - * a little utility function for all tests to md5sum a file - * Shameless taken from: - * - * http://www.javalobby.org/java/forums/t84420.html - * - * @param file the file - * @return a string - */ - private static String md5SumFile(File file) { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Unable to find MD5 digest"); - } - InputStream is; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to open file " + file); - } - byte[] buffer = new byte[8192]; - int read; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - - } - catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } - finally { - try { - is.close(); - } - catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } - } - - private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { - File myTempFile = null; - PrintWriter pw = null; - try { - myTempFile = File.createTempFile("VCFHeader","vcf"); - myTempFile.deleteOnExit(); - pw = new PrintWriter(myTempFile); - } catch (IOException e) { - Assert.fail("Unable to make a temp file!"); - } - for (VCFHeaderLine line : header.getMetaDataInSortedOrder()) - pw.println(line); - pw.close(); - Assert.assertEquals(md5SumFile(myTempFile), md5sum); - } - - public static int VCF4headerStringCount = 16; - - public static String VCF4headerStrings = - "##fileformat=VCFv4.0\n"+ - "##filedate=2010-06-21\n"+ - "##reference=NCBI36\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##FILTER=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n"+ - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - - - public static String VCF4headerStrings_with_negativeOne = - "##fileformat=VCFv4.0\n"+ - "##filedate=2010-06-21\n"+ - "##reference=NCBI36\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##INFO=\n"+ - "##FILTER=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n"+ - "##FORMAT=\n"+ - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - -} diff --git a/public/java/test/org/broadinstitute/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/public/java/test/org/broadinstitute/variant/vcf/VCFStandardHeaderLinesUnitTest.java deleted file mode 100644 index 02090c9cd..000000000 --- a/public/java/test/org/broadinstitute/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ /dev/null @@ -1,149 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.variant.vcf; - -import org.broadinstitute.variant.VariantBaseTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 30, 2010 - * Time: 3:32:08 PM - * To change this template use File | Settings | File Templates. - */ -public class VCFStandardHeaderLinesUnitTest extends VariantBaseTest { - @DataProvider(name = "getStandardLines") - public Object[][] makeGetStandardLines() { - List tests = new ArrayList(); - - // info - tests.add(new Object[]{"AC", "info", true}); - tests.add(new Object[]{"AN", "info", true}); - tests.add(new Object[]{"AF", "info", true}); - tests.add(new Object[]{"DP", "info", true}); - tests.add(new Object[]{"DB", "info", true}); - tests.add(new Object[]{"END", "info", true}); - - // format - tests.add(new Object[]{"GT", "format", true}); - tests.add(new Object[]{"GQ", "format", true}); - tests.add(new Object[]{"DP", "format", true}); - tests.add(new Object[]{"AD", "format", true}); - tests.add(new Object[]{"PL", "format", true}); - - tests.add(new Object[]{"NOT_STANDARD", "info", false}); - tests.add(new Object[]{"NOT_STANDARD", "format", false}); - - return tests.toArray(new Object[][]{}); - } - - - @Test(dataProvider = "getStandardLines") - public void getStandardLines(final String key, final String type, final boolean expectedToBeStandard) { - VCFCompoundHeaderLine line = null; - if ( type.equals("info") ) - line = VCFStandardHeaderLines.getInfoLine(key, false); - else if ( type.equals("format") ) - line = VCFStandardHeaderLines.getFormatLine(key, false); - else - throw new IllegalArgumentException("Unexpected type in getStandardLines " + type); - - if ( expectedToBeStandard ) { - Assert.assertNotNull(line); - Assert.assertEquals(line.getID(), key); - } else - Assert.assertNull(line); - } - - private class RepairHeaderTest extends TestDataProvider { - final VCFCompoundHeaderLine original, expectedResult; - - private RepairHeaderTest(final VCFCompoundHeaderLine original) { - this(original, original); - } - - private RepairHeaderTest(final VCFCompoundHeaderLine original, final VCFCompoundHeaderLine expectedResult) { - super(RepairHeaderTest.class); - this.original = original; - this.expectedResult = expectedResult; - } - } - - @DataProvider(name = "RepairHeaderTest") - public Object[][] makeRepairHeaderTest() { - final VCFInfoHeaderLine standardAC = VCFStandardHeaderLines.getInfoLine("AC"); - final VCFInfoHeaderLine goodAC = new VCFInfoHeaderLine("AC", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "x"); - - final VCFFormatHeaderLine standardGT = VCFStandardHeaderLines.getFormatLine("GT"); - final VCFFormatHeaderLine goodGT = new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x"); - - new RepairHeaderTest( standardGT, standardGT); - new RepairHeaderTest( goodGT, goodGT ); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", 2, VCFHeaderLineType.String, "x"), standardGT); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.Integer, "x"), standardGT); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.Float, "x"), standardGT); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "x"), standardGT); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.G, VCFHeaderLineType.String, "x"), standardGT); - new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "x"), standardGT); - - new RepairHeaderTest( standardAC, standardAC); - new RepairHeaderTest( goodAC, goodAC ); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.Integer, "x"), standardAC); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "x"), standardAC); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"), standardAC); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.Float, "x"), standardAC); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.String, "x"), standardAC); - new RepairHeaderTest( new VCFInfoHeaderLine("AC", 0, VCFHeaderLineType.Flag, "x"), standardAC); - - new RepairHeaderTest( new VCFInfoHeaderLine("NON_STANDARD_INFO", 1, VCFHeaderLineType.String, "x")); - new RepairHeaderTest( new VCFFormatHeaderLine("NON_STANDARD_FORMAT", 1, VCFHeaderLineType.String, "x")); - - return RepairHeaderTest.getTests(RepairHeaderTest.class); - } - - @Test(dataProvider = "RepairHeaderTest") - public void testRepairHeaderTest(RepairHeaderTest cfg) { - final VCFHeader toRepair = new VCFHeader(Collections.singleton((VCFHeaderLine)cfg.original)); - final VCFHeader repaired = VCFStandardHeaderLines.repairStandardHeaderLines(toRepair); - - VCFCompoundHeaderLine repairedLine = (VCFCompoundHeaderLine)repaired.getFormatHeaderLine(cfg.original.getID()); - if ( repairedLine == null ) repairedLine = (VCFCompoundHeaderLine)repaired.getInfoHeaderLine(cfg.original.getID()); - - Assert.assertNotNull(repairedLine, "Repaired header didn't contain the expected line"); - Assert.assertEquals(repairedLine.getID(), cfg.expectedResult.getID()); - Assert.assertEquals(repairedLine.getType(), cfg.expectedResult.getType()); - Assert.assertEquals(repairedLine.getCountType(), cfg.expectedResult.getCountType()); - if ( repairedLine.getCountType() == VCFHeaderLineCount.INTEGER ) - Assert.assertEquals(repairedLine.getCount(), cfg.expectedResult.getCount()); - } -} diff --git a/settings/repository/org.broadinstitute/variant-1.84.1338.jar b/settings/repository/org.broadinstitute/variant-1.84.1338.jar new file mode 100644 index 000000000..16812d569 Binary files /dev/null and b/settings/repository/org.broadinstitute/variant-1.84.1338.jar differ diff --git a/settings/repository/org.broadinstitute/variant-1.84.1338.xml b/settings/repository/org.broadinstitute/variant-1.84.1338.xml new file mode 100644 index 000000000..dde6f560d --- /dev/null +++ b/settings/repository/org.broadinstitute/variant-1.84.1338.xml @@ -0,0 +1,3 @@ + + +