Replace org.broadinstitute.variant with jar built from the Picard repo
The migration of org.broadinstitute.variant into the Picard repo is complete. This commit deletes the org.broadinstitute.variant sources from our repo and replaces it with a jar built from a checkout of the latest Picard-public svn revision.
This commit is contained in:
parent
cb2dd470b6
commit
e7e76ed76e
3
ivy.xml
3
ivy.xml
|
|
@ -35,6 +35,9 @@
|
|||
<!-- Tribble -->
|
||||
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
|
||||
|
||||
<!-- Variant -->
|
||||
<dependency org="org.broadinstitute" name="variant" rev="latest.integration"/>
|
||||
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
|
|
|
|||
|
|
@ -62,6 +62,27 @@ import java.util.*;
|
|||
*/
|
||||
public class CombineVariantsUnitTest {
|
||||
|
||||
public static int VCF4headerStringCount = 16;
|
||||
|
||||
public static String VCF4headerStrings =
|
||||
"##fileformat=VCFv4.0\n"+
|
||||
"##filedate=2010-06-21\n"+
|
||||
"##reference=NCBI36\n"+
|
||||
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
|
||||
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
|
||||
"##INFO=<ID=AF, Number=A, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
|
||||
"##INFO=<ID=CA, Number=1, Type=String, Description=\"Pilot 1 callability mask\">\n"+
|
||||
"##INFO=<ID=HP, Number=1, Type=Integer, Description=\"Reference homopolymer tract length\">\n"+
|
||||
"##INFO=<ID=NS, Number=1, Type=Integer, Description=\"Number of samples with data\">\n"+
|
||||
"##INFO=<ID=DB, Number=0, Type=Flag, Description=\"dbSNP membership build 129 - type match and indel sequence length match within 25 bp\">\n"+
|
||||
"##INFO=<ID=NR, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on reverse strand\">\n"+
|
||||
"##INFO=<ID=NF, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on forward strand\">\n"+
|
||||
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
|
||||
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
|
||||
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
|
||||
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
|
||||
|
||||
// this header is a small subset of the header in VCFHeaderUnitTest: VCF4headerStrings
|
||||
public static String VCF4headerStringsSmallSubset =
|
||||
"##fileformat=VCFv4.0\n" +
|
||||
|
|
@ -159,34 +180,34 @@ public class CombineVariantsUnitTest {
|
|||
|
||||
@Test
|
||||
public void testHeadersWhereOneIsAStrictSubsetOfTheOther() {
|
||||
VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings);
|
||||
VCFHeader one = createHeader(VCF4headerStrings);
|
||||
VCFHeader two = createHeader(VCF4headerStringsSmallSubset);
|
||||
ArrayList<VCFHeader> headers = new ArrayList<VCFHeader>();
|
||||
headers.add(one);
|
||||
headers.add(two);
|
||||
Set<VCFHeaderLine> lines = VCFUtils.smartMergeHeaders(headers, false);
|
||||
Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount);
|
||||
Assert.assertEquals(lines.size(), VCF4headerStringCount);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=IllegalStateException.class)
|
||||
public void testHeadersInfoDifferentValues() {
|
||||
VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings);
|
||||
VCFHeader one = createHeader(VCF4headerStrings);
|
||||
VCFHeader two = createHeader(VCF4headerStringsBrokenInfo);
|
||||
ArrayList<VCFHeader> headers = new ArrayList<VCFHeader>();
|
||||
headers.add(one);
|
||||
headers.add(two);
|
||||
Set<VCFHeaderLine> lines = VCFUtils.smartMergeHeaders(headers, false);
|
||||
Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount);
|
||||
Assert.assertEquals(lines.size(), VCF4headerStringCount);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeadersFormatDifferentValues() {
|
||||
VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings);
|
||||
VCFHeader one = createHeader(VCF4headerStrings);
|
||||
VCFHeader two = createHeader(VCF4headerStringsBrokenFormat);
|
||||
ArrayList<VCFHeader> headers = new ArrayList<VCFHeader>();
|
||||
headers.add(one);
|
||||
headers.add(two);
|
||||
Set<VCFHeaderLine> lines = VCFUtils.smartMergeHeaders(headers, false);
|
||||
Assert.assertEquals(lines.size(), VCFHeaderUnitTest.VCF4headerStringCount);
|
||||
Assert.assertEquals(lines.size(), VCF4headerStringCount);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@
|
|||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -79,7 +80,6 @@ import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
|||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -26,12 +26,14 @@
|
|||
package org.broadinstitute.sting.utils.variant;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
|
|
@ -162,6 +164,67 @@ public class GATKVCFUtils {
|
|||
return rsID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility class to read all of the VC records from a file
|
||||
*
|
||||
* @param source
|
||||
* @param codec
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public final static Pair<VCFHeader, VCIterable> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
|
||||
// read in the features
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
|
||||
return new Pair<VCFHeader, VCIterable>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
|
||||
}
|
||||
|
||||
public static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
|
||||
final PositionalBufferedStream pbs;
|
||||
final FeatureCodec<VariantContext> codec;
|
||||
final VCFHeader header;
|
||||
|
||||
private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec<VariantContext> codec, final VCFHeader header) {
|
||||
this.pbs = pbs;
|
||||
this.codec = codec;
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<VariantContext> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
return ! pbs.isDone();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContext next() {
|
||||
try {
|
||||
final VariantContext vc = codec.decode(pbs);
|
||||
return vc == null ? null : vc.fullyDecode(header, false);
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read all of the VCF records from source into memory, returning the header and the VariantContexts
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1,499 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Decode BCF2 files
|
||||
*/
|
||||
public final class BCF2Codec implements FeatureCodec<VariantContext> {
|
||||
private final static int ALLOWED_MAJOR_VERSION = 2;
|
||||
private final static int MIN_MINOR_VERSION = 1;
|
||||
|
||||
private BCFVersion bcfVersion = null;
|
||||
|
||||
private VCFHeader header = null;
|
||||
|
||||
/**
|
||||
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
|
||||
*/
|
||||
private final ArrayList<String> contigNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Maps header string names (encoded in VCF) into strings found in the BCF header
|
||||
*
|
||||
* Initialized when processing the header
|
||||
*/
|
||||
private ArrayList<String> dictionary;
|
||||
|
||||
/**
|
||||
* Our decoder that reads low-level objects from the BCF2 records
|
||||
*/
|
||||
private final BCF2Decoder decoder = new BCF2Decoder();
|
||||
|
||||
/**
|
||||
* Provides some sanity checking on the header
|
||||
*/
|
||||
private final static int MAX_HEADER_SIZE = 0x08000000;
|
||||
|
||||
/**
|
||||
* Genotype field decoders that are initialized when the header is read
|
||||
*/
|
||||
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
|
||||
|
||||
/**
|
||||
* A cached array of GenotypeBuilders for efficient genotype decoding.
|
||||
*
|
||||
* Caching it allows us to avoid recreating this intermediate data
|
||||
* structure each time we decode genotypes
|
||||
*/
|
||||
private GenotypeBuilder[] builders = null;
|
||||
|
||||
// for error handling
|
||||
private int recordNo = 0;
|
||||
private int pos = 0;
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Feature codec interface functions
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
|
||||
return decode(inputStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContext decode( final PositionalBufferedStream inputStream ) {
|
||||
try {
|
||||
recordNo++;
|
||||
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||
|
||||
final int sitesBlockSize = decoder.readBlockSize(inputStream);
|
||||
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
|
||||
|
||||
decoder.readNextBlock(sitesBlockSize, inputStream);
|
||||
decodeSiteLoc(builder);
|
||||
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
|
||||
|
||||
decoder.readNextBlock(genotypeBlockSize, inputStream);
|
||||
createLazyGenotypesDecoder(info, builder);
|
||||
return builder.fullyDecoded(true).make();
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("Failed to read BCF file", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Class<VariantContext> getFeatureType() {
|
||||
return VariantContext.class;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
|
||||
try {
|
||||
// note that this reads the magic as well, and so does double duty
|
||||
bcfVersion = BCFVersion.readBCFVersion(inputStream);
|
||||
if ( bcfVersion == null )
|
||||
error("Input stream does not contain a BCF encoded file; BCF magic header info not found");
|
||||
|
||||
if ( bcfVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION )
|
||||
error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion());
|
||||
if ( bcfVersion.getMinorVersion() < MIN_MINOR_VERSION )
|
||||
error("BCF2Codec can only process BCF2 files with minor version >= " + MIN_MINOR_VERSION + " but this file has minor version " + bcfVersion.getMinorVersion());
|
||||
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
|
||||
System.err.println("Parsing data stream with BCF version " + bcfVersion);
|
||||
}
|
||||
|
||||
final int headerSizeInBytes = BCF2Type.INT32.read(inputStream);
|
||||
|
||||
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
|
||||
error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
|
||||
|
||||
final byte[] headerBytes = new byte[headerSizeInBytes];
|
||||
if ( inputStream.read(headerBytes) != headerSizeInBytes )
|
||||
error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
|
||||
|
||||
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
|
||||
final AsciiLineReader headerReader = new AsciiLineReader(bps);
|
||||
final VCFCodec headerParser = new VCFCodec();
|
||||
this.header = (VCFHeader)headerParser.readHeader(headerReader);
|
||||
bps.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("I/O error while reading BCF2 header");
|
||||
}
|
||||
|
||||
// create the config offsets
|
||||
if ( ! header.getContigLines().isEmpty() ) {
|
||||
contigNames.clear();
|
||||
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
|
||||
if ( contig.getID() == null || contig.getID().equals("") )
|
||||
error("found a contig with an invalid ID " + contig);
|
||||
contigNames.add(contig.getID());
|
||||
}
|
||||
} else {
|
||||
error("Didn't find any contig lines in BCF2 file header");
|
||||
}
|
||||
|
||||
// create the string dictionary
|
||||
dictionary = parseDictionary(header);
|
||||
|
||||
// prepare the genotype field decoders
|
||||
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
|
||||
|
||||
// create and initialize the genotype builder array
|
||||
final int nSamples = header.getNGenotypeSamples();
|
||||
builders = new GenotypeBuilder[nSamples];
|
||||
for ( int i = 0; i < nSamples; i++ ) {
|
||||
builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i));
|
||||
}
|
||||
|
||||
// position right before next line (would be right before first real record byte at end of header)
|
||||
return new FeatureCodecHeader(header, inputStream.getPosition());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canDecode( final String path ) {
|
||||
FileInputStream fis = null;
|
||||
try {
|
||||
fis = new FileInputStream(path);
|
||||
final BCFVersion version = BCFVersion.readBCFVersion(fis);
|
||||
return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION;
|
||||
} catch ( FileNotFoundException e ) {
|
||||
return false;
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
} finally {
|
||||
try {
|
||||
if ( fis != null ) fis.close();
|
||||
} catch ( IOException e ) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// implicit block
|
||||
//
|
||||
// The first four records of BCF are inline untype encoded data of:
|
||||
//
|
||||
// 4 byte integer chrom offset
|
||||
// 4 byte integer start
|
||||
// 4 byte integer ref length
|
||||
// 4 byte float qual
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Decode the sites level data from this classes decoder
|
||||
*
|
||||
* @param builder
|
||||
* @return
|
||||
*/
|
||||
@Requires({"builder != null"})
|
||||
private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOException {
|
||||
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
|
||||
final String contig = lookupContigName(contigOffset);
|
||||
builder.chr(contig);
|
||||
|
||||
this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based
|
||||
final int refLength = decoder.decodeInt(BCF2Type.INT32);
|
||||
builder.start((long)pos);
|
||||
builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the sites level data from this classes decoder
|
||||
*
|
||||
* @param builder
|
||||
* @return
|
||||
*/
|
||||
@Requires({"builder != null", "decoder != null"})
|
||||
@Ensures({"result != null", "result.isValid()"})
|
||||
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) throws IOException {
|
||||
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
|
||||
if ( qual != null ) {
|
||||
builder.log10PError(((Double)qual) / -10.0);
|
||||
}
|
||||
|
||||
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
|
||||
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
|
||||
final int nAlleles = nAlleleInfo >> 16;
|
||||
final int nInfo = nAlleleInfo & 0x0000FFFF;
|
||||
final int nFormatFields = nFormatSamples >> 24;
|
||||
final int nSamples = nFormatSamples & 0x00FFFFF;
|
||||
|
||||
if ( header.getNGenotypeSamples() != nSamples )
|
||||
error("Reading BCF2 files with different numbers of samples per record " +
|
||||
"is not currently supported. Saw " + header.getNGenotypeSamples() +
|
||||
" samples in header but have a record with " + nSamples + " samples");
|
||||
|
||||
decodeID(builder);
|
||||
final List<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
|
||||
decodeFilter(builder);
|
||||
decodeInfo(builder, nInfo);
|
||||
|
||||
final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
|
||||
if ( ! info.isValid() )
|
||||
error("Sites info is malformed: " + info);
|
||||
return info;
|
||||
}
|
||||
|
||||
protected final static class SitesInfoForDecoding {
|
||||
final int nFormatFields;
|
||||
final int nSamples;
|
||||
final List<Allele> alleles;
|
||||
|
||||
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List<Allele> alleles) {
|
||||
this.nFormatFields = nFormatFields;
|
||||
this.nSamples = nSamples;
|
||||
this.alleles = alleles;
|
||||
}
|
||||
|
||||
public boolean isValid() {
|
||||
return nFormatFields >= 0 &&
|
||||
nSamples >= 0 &&
|
||||
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the id field in this BCF2 file and store it in the builder
|
||||
* @param builder
|
||||
*/
|
||||
private void decodeID( final VariantContextBuilder builder ) throws IOException {
|
||||
final String id = (String)decoder.decodeTypedValue();
|
||||
|
||||
if ( id == null )
|
||||
builder.noID();
|
||||
else
|
||||
builder.id(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the alleles from this BCF2 file and put the results in builder
|
||||
* @param builder
|
||||
* @param pos
|
||||
* @param nAlleles
|
||||
* @return the alleles
|
||||
*/
|
||||
@Requires("nAlleles > 0")
|
||||
private List<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) throws IOException {
|
||||
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
|
||||
List<Allele> alleles = new ArrayList<Allele>(nAlleles);
|
||||
String ref = null;
|
||||
|
||||
for ( int i = 0; i < nAlleles; i++ ) {
|
||||
final String alleleBases = (String)decoder.decodeTypedValue();
|
||||
|
||||
final boolean isRef = i == 0;
|
||||
final Allele allele = Allele.create(alleleBases, isRef);
|
||||
if ( isRef ) ref = alleleBases;
|
||||
|
||||
alleles.add(allele);
|
||||
}
|
||||
assert ref != null;
|
||||
|
||||
builder.alleles(alleles);
|
||||
|
||||
assert ref.length() > 0;
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode the filter field of this BCF2 file and store the result in the builder
|
||||
* @param builder
|
||||
*/
|
||||
private void decodeFilter( final VariantContextBuilder builder ) throws IOException {
|
||||
final Object value = decoder.decodeTypedValue();
|
||||
|
||||
if ( value == null )
|
||||
builder.unfiltered();
|
||||
else {
|
||||
if ( value instanceof Integer ) {
|
||||
// fast path for single integer result
|
||||
final String filterString = getDictionaryString((Integer)value);
|
||||
if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString))
|
||||
builder.passFilters();
|
||||
else
|
||||
builder.filter(filterString);
|
||||
} else {
|
||||
for ( final int offset : (List<Integer>)value )
|
||||
builder.filter(getDictionaryString(offset));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
|
||||
*
|
||||
* @param builder
|
||||
* @param numInfoFields
|
||||
*/
|
||||
@Requires("numInfoFields >= 0")
|
||||
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) throws IOException {
|
||||
if ( numInfoFields == 0 )
|
||||
// fast path, don't bother doing any work if there are no fields
|
||||
return;
|
||||
|
||||
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
|
||||
for ( int i = 0; i < numInfoFields; i++ ) {
|
||||
final String key = getDictionaryString();
|
||||
Object value = decoder.decodeTypedValue();
|
||||
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, key);
|
||||
if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags
|
||||
infoFieldEntries.put(key, value);
|
||||
}
|
||||
|
||||
builder.attributes(infoFieldEntries);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Decoding Genotypes
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Create the lazy loader for the genotypes data, and store it in the builder
|
||||
* so that the VC will be able to decode on demand the genotypes data
|
||||
*
|
||||
* @param siteInfo
|
||||
* @param builder
|
||||
*/
|
||||
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
|
||||
final VariantContextBuilder builder ) {
|
||||
if (siteInfo.nSamples > 0) {
|
||||
final LazyGenotypesContext.LazyParser lazyParser =
|
||||
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
|
||||
|
||||
final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
|
||||
final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());
|
||||
|
||||
// did we resort the sample names? If so, we need to load the genotype data
|
||||
if ( !header.samplesWereAlreadySorted() )
|
||||
lazy.decode();
|
||||
|
||||
builder.genotypesNoValidation(lazy);
|
||||
}
|
||||
}
|
||||
|
||||
public static class LazyData {
|
||||
final public VCFHeader header;
|
||||
final public int nGenotypeFields;
|
||||
final public byte[] bytes;
|
||||
|
||||
@Requires({"nGenotypeFields > 0", "bytes != null"})
|
||||
public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
|
||||
this.header = header;
|
||||
this.nGenotypeFields = nGenotypeFields;
|
||||
this.bytes = bytes;
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
private final String getDictionaryString() throws IOException {
|
||||
return getDictionaryString((Integer) decoder.decodeTypedValue());
|
||||
}
|
||||
|
||||
@Requires("offset < dictionary.size()")
|
||||
@Ensures("result != null")
|
||||
protected final String getDictionaryString(final int offset) {
|
||||
return dictionary.get(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate the config offset as encoded in the BCF file into the actual string
|
||||
* name of the contig from the dictionary
|
||||
*
|
||||
* @param contigOffset
|
||||
* @return
|
||||
*/
|
||||
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
|
||||
@Ensures("result != null")
|
||||
private final String lookupContigName( final int contigOffset ) {
|
||||
return contigNames.get(contigOffset);
|
||||
}
|
||||
|
||||
@Requires("header != null")
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
private final ArrayList<String> parseDictionary(final VCFHeader header) {
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
|
||||
// if we got here we never found a dictionary, or there are no elements in the dictionary
|
||||
if ( dict.isEmpty() )
|
||||
error("Dictionary header element was absent or empty");
|
||||
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the VCFHeader we found in this BCF2 file
|
||||
*/
|
||||
protected VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
@Requires("field != null")
|
||||
@Ensures("result != null")
|
||||
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
|
||||
return gtFieldDecoders.getDecoder(field);
|
||||
}
|
||||
|
||||
private void error(final String message) throws RuntimeException {
|
||||
throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,375 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class BCF2Decoder {
|
||||
byte[] recordBytes = null;
|
||||
ByteArrayInputStream recordStream = null;
|
||||
|
||||
public BCF2Decoder() {
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes
|
||||
*
|
||||
* @param recordBytes
|
||||
*/
|
||||
protected BCF2Decoder(final byte[] recordBytes) {
|
||||
setRecordBytes(recordBytes);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Routines to load, set, skip blocks of underlying data we are decoding
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reads the next record from input stream and prepare this decoder to decode values from it
|
||||
*
|
||||
* @param stream
|
||||
* @return
|
||||
*/
|
||||
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
|
||||
if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes);
|
||||
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
|
||||
}
|
||||
|
||||
/**
|
||||
* Skips the next record from input stream, invalidating current block data
|
||||
*
|
||||
* @param stream
|
||||
* @return
|
||||
*/
|
||||
public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) {
|
||||
try {
|
||||
final int bytesRead = (int)stream.skip(blockSizeInBytes);
|
||||
validateReadBytes(bytesRead, 1, blockSizeInBytes);
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("I/O error while reading BCF2 file", e);
|
||||
}
|
||||
this.recordBytes = null;
|
||||
this.recordStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the byte[] for the block of data we are currently decoding
|
||||
* @return
|
||||
*/
|
||||
public byte[] getRecordBytes() {
|
||||
return recordBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* The size of the current block in bytes
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getBlockSize() {
|
||||
return recordBytes.length;
|
||||
}
|
||||
|
||||
public boolean blockIsFullyDecoded() {
|
||||
return recordStream.available() == 0;
|
||||
}
|
||||
|
||||
/**
 * Use the recordBytes[] to read BCF2 records from now on.
 *
 * Resets the decoding stream so subsequent decode calls read from the
 * beginning of the supplied byte array.
 *
 * @param recordBytes non-null bytes of a single BCF2 record block
 */
@Requires("recordBytes != null")
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
public void setRecordBytes(final byte[] recordBytes) {
    this.recordBytes = recordBytes;
    this.recordStream = new ByteArrayInputStream(recordBytes);
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// High-level decoder
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public final Object decodeTypedValue() throws IOException {
|
||||
final byte typeDescriptor = readTypeDescriptor();
|
||||
return decodeTypedValue(typeDescriptor);
|
||||
}
|
||||
|
||||
public final Object decodeTypedValue(final byte typeDescriptor) throws IOException {
|
||||
final int size = decodeNumberOfElements(typeDescriptor);
|
||||
return decodeTypedValue(typeDescriptor, size);
|
||||
}
|
||||
|
||||
@Requires("size >= 0")
|
||||
public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException {
|
||||
if ( size == 0 ) {
|
||||
// missing value => null in java
|
||||
return null;
|
||||
} else {
|
||||
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
|
||||
if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency
|
||||
return decodeLiteralString(size);
|
||||
} else if ( size == 1 ) {
|
||||
return decodeSingleValue(type);
|
||||
} else {
|
||||
final ArrayList<Object> ints = new ArrayList<Object>(size);
|
||||
for ( int i = 0; i < size; i++ ) {
|
||||
final Object val = decodeSingleValue(type);
|
||||
if ( val == null ) continue; // auto-pruning. We remove trailing nulls
|
||||
ints.add(val);
|
||||
}
|
||||
return ints.isEmpty() ? null : ints; // return null when all of the values are null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public final Object decodeSingleValue(final BCF2Type type) throws IOException {
|
||||
// TODO -- decodeTypedValue should integrate this routine
|
||||
final int value = decodeInt(type);
|
||||
|
||||
if ( value == type.getMissingBytes() )
|
||||
return null;
|
||||
else {
|
||||
switch (type) {
|
||||
case INT8:
|
||||
case INT16:
|
||||
case INT32: return value;
|
||||
case FLOAT: return rawFloatToFloat(value);
|
||||
case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased
|
||||
default: throw new TribbleException("BCF2 codec doesn't know how to decode type " + type );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Decode raw primitive data types (ints, floats, and strings)
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
private final Object decodeLiteralString(final int size) {
|
||||
assert size > 0;
|
||||
|
||||
// TODO -- assumes size > 0
|
||||
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
|
||||
try {
|
||||
recordStream.read(bytes);
|
||||
|
||||
int goodLength = 0;
|
||||
for ( ; goodLength < bytes.length ; goodLength++ )
|
||||
if ( bytes[goodLength] == 0 ) break;
|
||||
|
||||
if ( goodLength == 0 )
|
||||
return null;
|
||||
else {
|
||||
final String s = new String(bytes, 0, goodLength);
|
||||
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s;
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("readByte failure", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException {
|
||||
if ( BCF2Utils.sizeIsOverflow(typeDescriptor) )
|
||||
// -1 ensures we explode immediately with a bad size if the result is missing
|
||||
return decodeInt(readTypeDescriptor(), -1);
|
||||
else
|
||||
// the size is inline, so just decode it
|
||||
return BCF2Utils.decodeSize(typeDescriptor);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode an int from the stream. If the value in the stream is missing,
|
||||
* returns missingValue. Requires the typeDescriptor indicate an inline
|
||||
* single element event
|
||||
*
|
||||
* @param typeDescriptor
|
||||
* @return
|
||||
*/
|
||||
@Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
|
||||
public final int decodeInt(final byte typeDescriptor, final int missingValue) throws IOException {
|
||||
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
|
||||
final int i = decodeInt(type);
|
||||
return i == type.getMissingBytes() ? missingValue : i;
|
||||
}
|
||||
|
||||
@Requires("type != null")
|
||||
public final int decodeInt(final BCF2Type type) throws IOException {
|
||||
return type.read(recordStream);
|
||||
}
|
||||
|
||||
/**
 * Low-level reader for int[]
 *
 * Requires a typeDescriptor so the function knows how many elements to read,
 * and how they are encoded.
 *
 * If size == 0 => result is null
 * If size > 0 => result depends on the actual values in the stream
 *    -- If the first element read is MISSING, result is null (all values are missing)
 *    -- Else result = int[N] where N is the first N non-missing values decoded
 *
 * @param maybeDest if not null we'll not allocate space for the vector, but instead use
 *                  the externally allocated array of ints to store values.  If the
 *                  size of this vector is < the actual size of the elements, we'll be
 *                  forced to use freshly allocated arrays.  Also note that padded
 *                  int elements are still forced to do a fresh allocation as well.
 * @return see description
 */
@Requires({"type != null", "type.isIntegerType()", "size >= 0"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException {
    if ( size == 0 ) {
        return null;
    } else {
        if ( maybeDest != null && maybeDest.length < size )
            maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small

        final int val1 = decodeInt(type);
        if ( val1 == type.getMissingBytes() ) {
            // fast path for first element being missing; NOTE the remaining elements
            // must still be consumed from the stream even though they are discarded
            for ( int i = 1; i < size; i++ ) decodeInt(type);
            return null;
        } else {
            // we know we will have at least 1 element, so making the int[] is worth it
            final int[] ints = maybeDest == null ? new int[size] : maybeDest;
            ints[0] = val1; // we already read the first one
            for ( int i = 1; i < size; i++ ) {
                ints[i] = decodeInt(type);
                if ( ints[i] == type.getMissingBytes() ) {
                    // read the rest of the missing values, dropping them
                    for ( int j = i + 1; j < size; j++ ) decodeInt(type);
                    // deal with auto-pruning by returning an int[] containing
                    // only the non-MISSING values. We do this by copying the first
                    // i elements, as i itself is missing
                    return Arrays.copyOf(ints, i);
                }
            }
            return ints; // all of the elements were non-MISSING
        }
    }
}
|
||||
|
||||
public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException {
|
||||
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
|
||||
return decodeIntArray(size, type, null);
|
||||
}
|
||||
|
||||
// reinterpret the raw BCF2 bits as an IEEE-754 float, widened to double
private double rawFloatToFloat(final int rawFloat) {
    return (double)Float.intBitsToFloat(rawFloat);
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Utility functions
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
 * Read the size of the next block from inputStream.
 *
 * @param inputStream stream positioned at a 4-byte (INT32-encoded) block size
 * @return the number of bytes in the next block
 */
public final int readBlockSize(final InputStream inputStream) throws IOException {
    return BCF2Type.INT32.read(inputStream);
}
|
||||
|
||||
/**
|
||||
* Read all bytes for a BCF record block into a byte[], and return it
|
||||
*
|
||||
* Is smart about reading from the stream multiple times to fill the buffer, if necessary
|
||||
*
|
||||
* @param blockSizeInBytes number of bytes to read
|
||||
* @param inputStream the stream to read from
|
||||
* @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream
|
||||
*/
|
||||
@Requires({"blockSizeInBytes >= 0", "inputStream != null"})
|
||||
@Ensures("result != null")
|
||||
private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
|
||||
assert blockSizeInBytes >= 0;
|
||||
|
||||
final byte[] record = new byte[blockSizeInBytes];
|
||||
try {
|
||||
int bytesRead = 0;
|
||||
int nReadAttempts = 0; // keep track of how many times we've read
|
||||
|
||||
// because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF
|
||||
while ( bytesRead < blockSizeInBytes ) {
|
||||
final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead);
|
||||
if ( read1 == -1 )
|
||||
validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
|
||||
else
|
||||
bytesRead += read1;
|
||||
}
|
||||
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me
|
||||
System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior");
|
||||
}
|
||||
|
||||
validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("I/O error while reading BCF2 file", e);
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure we read the right number of bytes, or throw an error
|
||||
*
|
||||
* @param actuallyRead
|
||||
* @param nReadAttempts
|
||||
* @param expected
|
||||
*/
|
||||
private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) {
|
||||
assert expected >= 0;
|
||||
|
||||
if ( actuallyRead < expected ) {
|
||||
throw new TribbleException(
|
||||
String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations",
|
||||
expected, actuallyRead, nReadAttempts));
|
||||
}
|
||||
}
|
||||
|
||||
// read the next type descriptor byte from the current block's stream
public final byte readTypeDescriptor() throws IOException {
    return BCF2Utils.readByte(recordStream);
}
|
||||
}
|
||||
|
|
@ -1,284 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * An efficient scheme for building and obtaining specialized
 * genotype field decoders. Used by the BCFCodec to parse
 * with little overhead the fields from BCF2 encoded genotype
 * records
 *
 * @author Mark DePristo
 * @since 6/12
 */
public class BCF2GenotypeFieldDecoders {
    // when true, biallelic diploid GT fields take the specialized fast decoding path
    private final static boolean ENABLE_FASTPATH_GT = true;
    private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number

    // initialized once per writer to allow parallel writers to work
    private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
    // fallback decoder used for any FORMAT field without a specialized entry
    private final Decoder defaultDecoder = new GenericDecoder();

    public BCF2GenotypeFieldDecoders(final VCFHeader header) {
        // TODO -- fill in appropriate decoders for each FORMAT field in the header

        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
        // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder());
        genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_PL_KEY, new PLDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
    }

    // -----------------------------------------------------------------
    //
    // Genotype field decoder
    //
    // -----------------------------------------------------------------

    /**
     * Return decoder appropriate for field, or the generic decoder if no
     * specialized one is bound
     * @param field the GT field to decode
     * @return a non-null decoder
     */
    @Requires("field != null")
    @Ensures("result != null")
    public Decoder getDecoder(final String field) {
        final Decoder d = genotypeFieldDecoder.get(field);
        return d == null ? defaultDecoder : d;
    }

    /**
     * Decoder a field (implicit from creation) encoded as
     * typeDescriptor in the decoder object in the GenotypeBuilders
     * one for each sample in order.
     *
     * The way this works is that this decode method
     * iterates over the builders, decoding a genotype field
     * in BCF2 for each sample from decoder.
     *
     * This system allows us to easily use specialized
     * decoders for specific genotype field values. For example,
     * we use a special decoder to directly read the BCF2 data for
     * the PL field into a int[] rather than the generic List of Integer
     */
    public interface Decoder {
        @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
                "field != null", "decoder != null", "gbs != null", "gbs.length != 0"})
        public void decode(final List<Allele> siteAlleles,
                           final String field,
                           final BCF2Decoder decoder,
                           final byte typeDescriptor,
                           final int numElements,
                           final GenotypeBuilder[] gbs) throws IOException;
    }

    // Specialized decoder for the GT (genotype) field
    private class GTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            // the fast path applies only to biallelic sites with diploid (2-element) GT encodings
            if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && numElements == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
                fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
            else {
                generalDecode(siteAlleles, numElements, decoder, typeDescriptor, gbs);
            }
        }

        /**
         * fast path for many samples with diploid genotypes
         *
         * The way this would work is simple. Create a List<Allele> diploidGenotypes[] object
         * After decoding the offset, if that sample is diploid compute the
         * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
         * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
         * cache it and use that
         *
         * Some notes. If there are nAlleles at the site, there are implicitly actually
         * n + 1 options including
         */
        @Requires("siteAlleles.size() == 2")
        @SuppressWarnings({"unchecked"})
        private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
                                                      final BCF2Decoder decoder,
                                                      final byte typeDescriptor,
                                                      final GenotypeBuilder[] gbs) throws IOException {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // 3 allele choices per chromosome (no-call, ref, alt) => 9 possible diploid genotypes
            final int nPossibleGenotypes = 3 * 3;
            final Object allGenotypes[] = new Object[nPossibleGenotypes];

            for ( final GenotypeBuilder gb : gbs ) {
                // each sample encodes two allele values (one per chromosome)
                final int a1 = decoder.decodeInt(type);
                final int a2 = decoder.decodeInt(type);

                if ( a1 == type.getMissingBytes() ) {
                    assert a2 == type.getMissingBytes();
                    // no called sample GT = .
                    gb.alleles(null);
                } else if ( a2 == type.getMissingBytes() ) {
                    // haploid call: only the first allele is present
                    gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
                } else {
                    // downshift to remove phase
                    final int offset = (a1 >> 1) * 3 + (a2 >> 1);
                    assert offset < allGenotypes.length;

                    // cache the allele pair per offset so all samples with the same
                    // genotype share one List<Allele>
                    // TODO -- how can I get rid of this cast?
                    List<Allele> gt = (List<Allele>)allGenotypes[offset];
                    if ( gt == null ) {
                        final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
                        final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
                        gt = Arrays.asList(allele1, allele2);
                        allGenotypes[offset] = gt;
                    }

                    gb.alleles(gt);
                }

                // the low bit of the encoded allele carries the phase flag
                // NOTE(review): when a1 is MISSING this reads the phase bit from the
                // missing sentinel — presumably harmless because alleles are null; confirm
                final boolean phased = (a1 & 0x01) == 1;
                gb.phased(phased);
            }
        }

        // general path: arbitrary ploidy and allele count
        private final void generalDecode(final List<Allele> siteAlleles,
                                         final int ploidy,
                                         final BCF2Decoder decoder,
                                         final byte typeDescriptor,
                                         final GenotypeBuilder[] gbs) throws IOException {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // a single cache for the encoded genotypes, since we don't actually need this vector
            final int[] tmp = new int[ploidy];

            for ( final GenotypeBuilder gb : gbs ) {
                final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
                if ( encoded == null )
                    // no called sample GT = .
                    gb.alleles(null);
                else {
                    assert encoded.length > 0;

                    // we have at least some alleles to decode
                    final List<Allele> gt = new ArrayList<Allele>(encoded.length);

                    // note that the auto-pruning of fields magically handles different
                    // ploidy per sample at a site
                    for ( final int encode : encoded )
                        gt.add(getAlleleFromEncoded(siteAlleles, encode));

                    gb.alleles(gt);
                    // the low bit of the first encoded allele carries the phase flag
                    final boolean phased = (encoded[0] & 0x01) == 1;
                    gb.phased(phased);
                }
            }
        }

        // map an encoded allele value to its Allele: offset 0 is NO_CALL, otherwise
        // a 1-based index (after dropping the phase bit) into the site alleles
        @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
        @Ensures("result != null")
        private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
            final int offset = encode >> 1;
            return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
        }
    }

    // Specialized decoder for the DP (depth) field: a single int per sample
    private class DPDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.DP(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    // Specialized decoder for the GQ (genotype quality) field: a single int per sample
    private class GQDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.GQ(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    // Specialized decoder for the AD (allele depths) field: an int[] per sample
    private class ADDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.AD(decoder.decodeIntArray(typeDescriptor, numElements));
            }
        }
    }

    // Specialized decoder for the PL (phred-scaled likelihoods) field: an int[] per sample
    private class PLDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.PL(decoder.decodeIntArray(typeDescriptor, numElements));
            }
        }
    }

    // Fallback decoder: stores the decoded value as a generic genotype attribute
    private class GenericDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor, numElements);
                if ( value != null ) { // don't add missing values
                    if ( value instanceof List && ((List)value).size() == 1) {
                        // todo -- I really hate this, and it suggests that the code isn't completely right
                        // the reason it's here is that it's possible to prune down a vector to a singleton
                        // value and there we have the contract that the value comes back as an atomic value
                        // not a vector of size 1
                        value = ((List)value).get(0);
                    }
                    gb.attribute(field, value);
                }
            }
        }
    }

    // Specialized decoder for the FT (genotype filter) field: a single String per sample
    private class FTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) throws IOException {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor, numElements);
                assert value == null || value instanceof String;
                gb.filter((String)value);
            }
        }
    }
}
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Lazy version of genotypes decoder for BCF2 genotypes
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 5/12
|
||||
*/
|
||||
public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
// the essential information for us to use to decode the genotypes data
|
||||
// initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
|
||||
// and its stored here again for code cleanliness
|
||||
private final BCF2Codec codec;
|
||||
private final List<Allele> siteAlleles;
|
||||
private final int nSamples;
|
||||
private final int nFields;
|
||||
private final GenotypeBuilder[] builders;
|
||||
|
||||
@Requires("codec.getHeader().getNGenotypeSamples() == builders.length")
|
||||
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final List<Allele> alleles, final int nSamples,
|
||||
final int nFields, final GenotypeBuilder[] builders) {
|
||||
this.codec = codec;
|
||||
this.siteAlleles = alleles;
|
||||
this.nSamples = nSamples;
|
||||
this.nFields = nFields;
|
||||
this.builders = builders;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
try {
|
||||
|
||||
// load our byte[] data into the decoder
|
||||
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
|
||||
|
||||
for ( int i = 0; i < nSamples; i++ )
|
||||
builders[i].reset(true);
|
||||
|
||||
for ( int i = 0; i < nFields; i++ ) {
|
||||
// get the field name
|
||||
final int offset = (Integer) decoder.decodeTypedValue();
|
||||
final String field = codec.getDictionaryString(offset);
|
||||
|
||||
// the type of each element
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
final int numElements = decoder.decodeNumberOfElements(typeDescriptor);
|
||||
final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field);
|
||||
try {
|
||||
fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, numElements, builders);
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new TribbleException("BUG: expected encoding of field " + field
|
||||
+ " inconsistent with the value observed in the decoded value");
|
||||
}
|
||||
}
|
||||
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
|
||||
for ( final GenotypeBuilder gb : builders )
|
||||
genotypes.add(gb.make());
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
|
||||
} catch ( IOException e ) {
|
||||
throw new TribbleException("Unexpected IOException parsing already read genotypes data block", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,219 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* BCF2 types and associated information
|
||||
*
|
||||
* @author depristo
|
||||
* @since 05/12
|
||||
*/
|
||||
public enum BCF2Type {
|
||||
// the actual values themselves
|
||||
MISSING(0, 0, 0x00) {
|
||||
@Override public int read(final InputStream in) throws IOException {
|
||||
throw new IllegalArgumentException("Cannot read MISSING type");
|
||||
}
|
||||
@Override public void write(final int value, final OutputStream out) throws IOException {
|
||||
throw new IllegalArgumentException("Cannot write MISSING type");
|
||||
}
|
||||
},
|
||||
|
||||
INT8 (1, 1, 0xFFFFFF80, -127, 127) {
|
||||
@Override
|
||||
public int read(final InputStream in) throws IOException {
|
||||
return BCF2Utils.readByte(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final int value, final OutputStream out) throws IOException {
|
||||
out.write(0xFF & value); // TODO -- do we need this operation?
|
||||
}
|
||||
},
|
||||
|
||||
INT16(2, 2, 0xFFFF8000, -32767, 32767) {
|
||||
@Override
|
||||
public int read(final InputStream in) throws IOException {
|
||||
final int b2 = BCF2Utils.readByte(in) & 0xFF;
|
||||
final int b1 = BCF2Utils.readByte(in) & 0xFF;
|
||||
return (short)((b1 << 8) | b2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final int value, final OutputStream out) throws IOException {
|
||||
// TODO -- optimization -- should we put this in a local buffer?
|
||||
out.write((0x00FF & value));
|
||||
out.write((0xFF00 & value) >> 8);
|
||||
}
|
||||
},
|
||||
|
||||
INT32(3, 4, 0x80000000, -2147483647, 2147483647) {
|
||||
@Override
|
||||
public int read(final InputStream in) throws IOException {
|
||||
final int b4 = BCF2Utils.readByte(in) & 0xFF;
|
||||
final int b3 = BCF2Utils.readByte(in) & 0xFF;
|
||||
final int b2 = BCF2Utils.readByte(in) & 0xFF;
|
||||
final int b1 = BCF2Utils.readByte(in) & 0xFF;
|
||||
return (int)(b1 << 24 | b2 << 16 | b3 << 8 | b4);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final int value, final OutputStream out) throws IOException {
|
||||
out.write((0x000000FF & value));
|
||||
out.write((0x0000FF00 & value) >> 8);
|
||||
out.write((0x00FF0000 & value) >> 16);
|
||||
out.write((0xFF000000 & value) >> 24);
|
||||
}
|
||||
},
|
||||
|
||||
FLOAT(5, 4, 0x7F800001) {
|
||||
@Override
|
||||
public int read(final InputStream in) throws IOException {
|
||||
return INT32.read(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final int value, final OutputStream out) throws IOException {
|
||||
INT32.write(value, out);
|
||||
}
|
||||
},
|
||||
|
||||
CHAR (7, 1, 0x00000000) {
|
||||
@Override
|
||||
public int read(final InputStream in) throws IOException {
|
||||
return INT8.read(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final int value, final OutputStream out) throws IOException {
|
||||
INT8.write(value, out);
|
||||
}
|
||||
};
|
||||
|
||||
    // type ID as defined by the BCF2 specification (also the index into dictionaries)
    private final int id;
    // the Java-side representation of a missing value (always null for these types)
    private final Object missingJavaValue;
    // the reserved on-disk bit pattern (as an int) that encodes "missing" for this type
    private final int missingBytes;
    // fixed on-disk size of a single value of this type, in bytes
    private final int sizeInBytes;
    // inclusive bounds of encodable values; only meaningful for the integer types,
    // 0/0 for types constructed via the short-form constructor
    private final long minValue, maxValue;

    // Constructor for types with no meaningful integer range (FLOAT, CHAR, MISSING).
    BCF2Type(final int id, final int sizeInBytes, final int missingBytes) {
        this(id, sizeInBytes, missingBytes, 0, 0);
    }

    // Full constructor; minValue/maxValue bound the values encodable by the integer types.
    BCF2Type(final int id, final int sizeInBytes, final int missingBytes, final long minValue, final long maxValue) {
        this.id = id;
        this.sizeInBytes = sizeInBytes;
        this.missingJavaValue = null;
        this.missingBytes = missingBytes;
        this.minValue = minValue;
        this.maxValue = maxValue;
    }
|
||||
|
||||
    /**
     * How many bytes are used to represent this type on disk?
     *
     * @return the fixed on-disk size of one value of this type, in bytes
     */
    public int getSizeInBytes() {
        return sizeInBytes;
    }

    /**
     * The ID according to the BCF2 specification
     *
     * @return the integer type ID defined by the BCF2 spec
     */
    public int getID() { return id; }

    /**
     * Can we encode value v in this type, according to its declared range.
     *
     * Only makes sense for integer types; for other types min/max are both 0.
     *
     * @param v the value to test
     * @return true if v lies within [minValue, maxValue] inclusive
     */
    @Requires("this.isIntegerType()")
    public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }

    /**
     * Return the java object (aka null) that is used to represent a missing value for this
     * type in Java
     *
     * @return always null for the types defined here
     */
    public Object getMissingJavaValue() { return missingJavaValue; }

    /**
     * The bytes (encoded as an int) that are used to represent a missing value
     * for this type in BCF2
     *
     * @return the reserved "missing" bit pattern for this type
     */
    public int getMissingBytes() { return missingBytes; }
|
||||
|
||||
    /**
     * An enum set of the types that might represent Integer values
     */
    private final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);

    /**
     * @return true if this BCF2Type corresponds to the magic "MISSING" type (0x00)
     */
    public boolean isMissingType() {
        return this == MISSING;
    }

    /**
     * @return true if this type is one of the integer types (INT8, INT16, INT32)
     */
    public boolean isIntegerType() {
        return INTEGERS.contains(this);
    }
|
||||
|
||||
    /**
     * Read a value from in stream of this BCF2 type as an int [32 bit] collection of bits
     *
     * For intX and char values this is just the int / byte value of the underlying data represented as a 32 bit int
     * For a char the result must be converted to a char by (char)(byte)(0x0F & value)
     * For doubles it's necessary to convert subsequently this value to a double via Double.bitsToDouble()
     *
     * Default implementation throws; each readable constant (INT8/INT16/INT32/FLOAT/CHAR)
     * overrides this with its own wire format.
     * NOTE(review): UnsupportedOperationException would be the conventional type here,
     * but changing it would alter what callers can catch — left as-is.
     *
     * @param in the stream to read from
     * @return the raw 32-bit representation of the value
     * @throws IOException on underlying stream errors
     */
    @Requires("in != null")
    public int read(final InputStream in) throws IOException {
        throw new IllegalArgumentException("Not implemented");
    }

    /**
     * Write the raw 32-bit representation of a value to out in this type's wire format.
     * Default implementation throws; writable constants override it.
     *
     * @param value the raw bits to write
     * @param out the stream to write to
     * @throws IOException on underlying stream errors
     */
    @Requires("out != null")
    public void write(final int value, final OutputStream out) throws IOException {
        throw new IllegalArgumentException("Not implemented");
    }
|
||||
}
|
||||
|
|
@ -1,333 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Common utilities for working with BCF2 files
 *
 * Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type),
 * building the header string dictionary, and picking integer encodings for values.
 *
 * @author depristo
 * @since 5/12
 */
public final class BCF2Utils {
    // GT offsets are encoded in a single signed byte, so at most 127 alleles per site
    public static final int MAX_ALLELES_IN_GENOTYPES = 127;

    // A type descriptor's 4-bit size field holds 0-14 inline; 15 flags an
    // explicitly-encoded (overflow) element count following the descriptor.
    public static final int OVERFLOW_ELEMENT_MARKER = 15;
    public static final int MAX_INLINE_ELEMENTS = 14;

    // Integer types ordered smallest to largest, for picking the tightest encoding
    public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
    // Maps a BCF2 type ID back to its enum constant; indices without a type stay null
    public final static BCF2Type[] ID_TO_ENUM;

    static {
        // size the lookup table to the largest declared type ID, then populate it
        int maxID = -1;
        for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID);
        ID_TO_ENUM = new BCF2Type[maxID+1];
        for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v;
    }

    // utility class; not instantiable
    private BCF2Utils() {}

    /**
     * Create a strings dictionary from the VCF header
     *
     * The dictionary is an ordered list of common VCF identifiers (FILTER, INFO, and FORMAT)
     * fields.
     *
     * Note that its critical that the list be dedupped and sorted in a consistent manner each time,
     * as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly
     * the same way as in the header each time it's very bad
     *
     * @param header the VCFHeader from which to build the dictionary
     * @return a non-null dictionary of elements, may be empty
     */
    @Requires("header != null")
    @Ensures({"result != null", "new HashSet(result).size() == result.size()"})
    public static ArrayList<String> makeDictionary(final VCFHeader header) {
        final Set<String> seen = new HashSet<String>();
        final ArrayList<String> dict = new ArrayList<String>();

        // special case the special PASS field which doesn't show up in the FILTER field definitions
        seen.add(VCFConstants.PASSES_FILTERS_v4);
        dict.add(VCFConstants.PASSES_FILTERS_v4);

        // set up the strings dictionary: every ID header line except contig lines,
        // in input order, first occurrence wins
        for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
            if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) {
                final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
                if ( ! seen.contains(idLine.getID())) {
                    dict.add(idLine.getID());
                    seen.add(idLine.getID());
                }
            }
        }

        return dict;
    }

    /**
     * Pack an element count (0-15) and a type ID into a single descriptor byte:
     * high nibble = size, low nibble = type ID.
     */
    @Requires({"nElements >= 0", "nElements <= OVERFLOW_ELEMENT_MARKER", "type != null"})
    public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
        return (byte)((0x0F & nElements) << 4 | (type.getID() & 0x0F));
    }

    /** Extract the element count (high nibble) from a type descriptor byte. */
    @Ensures("result >= 0")
    public static int decodeSize(final byte typeDescriptor) {
        return (0xF0 & typeDescriptor) >> 4;
    }

    /** Extract the type ID (low nibble) from a type descriptor byte. */
    @Ensures("result >= 0")
    public static int decodeTypeID(final byte typeDescriptor) {
        return typeDescriptor & 0x0F;
    }

    /**
     * Map a type descriptor byte to its BCF2Type.
     * NOTE(review): an ID with no declared type yields null (or, past the table end,
     * an ArrayIndexOutOfBoundsException) — assumes well-formed input; verify at call sites.
     */
    @Ensures("result != null")
    public static BCF2Type decodeType(final byte typeDescriptor) {
        return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
    }

    /** True when the descriptor's size nibble is the overflow marker (15). */
    public static boolean sizeIsOverflow(final byte typeDescriptor) {
        return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
    }

    /**
     * Read a single byte from stream.
     * NOTE(review): does not detect EOF — InputStream.read() returns -1 at end of
     * stream, which this silently converts to the byte 0xFF.
     */
    public static byte readByte(final InputStream stream) throws IOException {
        return (byte)(stream.read() & 0xFF);
    }

    /**
     * Collapse multiple strings into a comma separated list
     *
     * ["s1", "s2", "s3"] => ",s1,s2,s3"
     *
     * Note the leading comma: it marks the result as a collapsed list (see isCollapsedString).
     * Empty input yields "", a singleton yields its sole element unchanged (no leading comma).
     *
     * @param strings size > 1 list of strings
     * @return the collapsed representation, never null
     */
    @Requires({"strings != null"})
    @Ensures("result != null")
    public static String collapseStringList(final List<String> strings) {
        if ( strings.isEmpty() ) return "";
        else if ( strings.size() == 1 ) return strings.get(0);
        else {
            final StringBuilder b = new StringBuilder();
            for ( final String s : strings ) {
                if ( s != null ) {
                    assert s.indexOf(",") == -1; // no commas in individual strings
                    b.append(",").append(s);
                }
            }
            return b.toString();
        }
    }

    /**
     * Inverse operation of collapseStringList.
     *
     * ",s1,s2,s3" => ["s1", "s2", "s3"]
     *
     * The leading comma is stripped before splitting, so the first token is "s1", not "".
     *
     * @param collapsed a string produced by collapseStringList (must start with ',')
     * @return a fixed-size list view of the exploded elements, never null
     */
    @Requires({"collapsed != null", "isCollapsedString(collapsed)"})
    @Ensures("result != null")
    public static List<String> explodeStringList(final String collapsed) {
        assert isCollapsedString(collapsed);
        final String[] exploded = collapsed.substring(1).split(",");
        return Arrays.asList(exploded);
    }

    /** True when s carries the leading-comma marker used by collapseStringList. */
    @Requires("s != null")
    public static boolean isCollapsedString(final String s) {
        return s.length() > 0 && s.charAt(0) == ',';
    }

    /**
     * Returns a good name for a shadow BCF file for vcfFile.
     *
     * foo.vcf => foo.bcf
     * foo.xxx => foo.xxx.bcf
     *
     * If the resulting BCF file cannot be written, return null. Happens
     * when vcfFile = /dev/null for example
     *
     * @param vcfFile the VCF whose shadow name to compute
     * @return the BCF File, or null when it could not be created
     */
    @Requires("vcfFile != null")
    public static final File shadowBCF(final File vcfFile) {
        final String path = vcfFile.getAbsolutePath();
        if ( path.contains(".vcf") )
            return new File(path.replace(".vcf", ".bcf"));
        else {
            final File bcf = new File( path + ".bcf" );
            if ( bcf.canRead() )
                return bcf;
            else {
                try {
                    // this is the only way to robustly decide if we could actually write to BCF
                    final FileOutputStream o = new FileOutputStream(bcf);
                    o.close();
                    bcf.delete();
                    return bcf;
                } catch ( FileNotFoundException e ) {
                    return null;
                } catch ( IOException e ) {
                    return null;
                }
            }
        }
    }

    /**
     * Pick the smallest BCF2 integer type whose range can hold value.
     *
     * @param value the value to encode
     * @return INT8, INT16, or INT32
     */
    @Ensures("result.isIntegerType()")
    public static BCF2Type determineIntegerType(final int value) {
        for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
            if ( potentialType.withinRange(value) )
                return potentialType;
        }

        throw new TribbleException("Integer cannot be encoded in allowable range of even INT32: " + value);
    }

    /**
     * Pick the smallest BCF2 integer type that can hold every element of values.
     * An empty array yields INT8 (min and max both start at 0).
     */
    @Ensures("result.isIntegerType()")
    public static BCF2Type determineIntegerType(final int[] values) {
        // find the min and max values in the array
        int max = 0, min = 0;
        for ( final int v : values ) {
            if ( v > max ) max = v;
            if ( v < min ) min = v;
        }

        final BCF2Type maxType = determineIntegerType(max);
        final BCF2Type minType = determineIntegerType(min);

        // INT8 < INT16 < INT32 so this returns the larger of the two
        return maxType.compareTo(minType) >= 0 ? maxType : minType;
    }

    /**
     * Returns the maximum BCF2 integer size of t1 and t2
     *
     * For example, if t1 == INT8 and t2 == INT16 returns INT16
     *
     * @param t1 an integer type
     * @param t2 an integer type
     * @return the wider of the two
     */
    @Requires({"t1.isIntegerType()","t2.isIntegerType()"})
    @Ensures("result.isIntegerType()")
    public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
        switch ( t1 ) {
            case INT8: return t2;
            case INT16: return t2 == BCF2Type.INT32 ? t2 : t1;
            case INT32: return t1;
            default: throw new TribbleException("BUG: unexpected BCF2Type " + t1);
        }
    }

    /**
     * Pick the smallest BCF2 integer type that can hold every element of values.
     * Short-circuits as soon as an INT32-requiring value is seen.
     */
    @Ensures("result.isIntegerType()")
    public static BCF2Type determineIntegerType(final List<Integer> values) {
        BCF2Type maxType = BCF2Type.INT8;
        for ( final int value : values ) {
            final BCF2Type type1 = determineIntegerType(value);
            switch ( type1 ) {
                case INT8: break;
                case INT16: maxType = BCF2Type.INT16; break;
                case INT32: return BCF2Type.INT32; // fast path for largest possible value
                default: throw new TribbleException("Unexpected integer type " + type1 );
            }
        }
        return maxType;
    }

    /**
     * Helper function that takes an object and returns a list representation
     * of it:
     *
     * o == null => []
     * o is a list => o
     * else => [o]
     *
     * @param o the object to wrap
     * @return a list view of o, never null
     */
    public static List<Object> toList(final Object o) {
        if ( o == null ) return Collections.emptyList();
        else if ( o instanceof List ) return (List<Object>)o;
        else return Collections.singletonList(o);
    }

    /**
     * Are the elements and their order in the output and input headers consistent so that
     * we can write out the raw genotypes block without decoding and recoding it?
     *
     * If the order of INFO, FILTER, or contig elements in the output header is different than
     * in the input header we must decode the blocks using the input header and then recode them
     * based on the new output order.
     *
     * If they are consistent, we can simply pass through the raw genotypes block bytes, which is
     * a *huge* performance win for large blocks.
     *
     * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
     * don't modify the ordering of the header fields and so can safely pass through the genotypes
     * undecoded. Some operations -- those that add filters or info fields -- can change the ordering
     * of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded
     */
    public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
        // first, we have to have the same samples in the same order
        if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
            return false;

        final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
        final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();

        // every input line must match the corresponding output line in class and ID;
        // extra trailing output lines are allowed (appending doesn't shift input offsets)
        while ( inputLinesIt.hasNext() ) {
            if ( ! outputLinesIt.hasNext() ) // missing lines in output
                return false;

            final VCFIDHeaderLine outputLine = outputLinesIt.next();
            final VCFIDHeaderLine inputLine = inputLinesIt.next();

            if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
                return false;
        }

        return true;
    }

    // treat a null list as empty so the sample-name comparison above can't NPE
    private static <T> List<T> nullAsEmpty(List<T> l) {
        if ( l == null )
            return Collections.emptyList();
        else
            return l;
    }
}
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * Simple holder for BCF version information
 *
 * User: depristo
 * Date: 8/2/12
 * Time: 2:16 PM
 */
public class BCFVersion {
    /**
     * BCF2 begins with the MAGIC info BCF_M_m where M is the major version (currently 2)
     * and m is the minor version, currently 1
     */
    public static final byte[] MAGIC_HEADER_START = "BCF".getBytes();

    // package-private on purpose: same-package readers access these directly
    final int majorVersion;
    final int minorVersion;

    /**
     * @param majorVersion the major version number of the BCF file
     * @param minorVersion the minor version number of the BCF file
     */
    public BCFVersion(int majorVersion, int minorVersion) {
        this.majorVersion = majorVersion;
        this.minorVersion = minorVersion;
    }

    /**
     * @return the major version number of this BCF file
     */
    public int getMajorVersion() {
        return majorVersion;
    }

    /**
     * @return the minor version number of this BCF file
     */
    public int getMinorVersion() {
        return minorVersion;
    }

    /**
     * Return a new BCFVersion object describing the major and minor version of the BCF file in stream
     *
     * Note that stream must be at the very start of the file.
     *
     * @param stream the stream positioned at the start of a potential BCF file
     * @return a BCFVersion object, or null if stream doesn't contain a BCF file
     *         (including the case where the stream ends before a complete header is read)
     * @throws IOException on underlying stream errors
     */
    public static BCFVersion readBCFVersion(final InputStream stream) throws IOException {
        final byte[] magicBytes = new byte[MAGIC_HEADER_START.length];

        // InputStream.read(byte[]) may return fewer bytes than requested, so loop
        // until the magic is fully read; EOF before that means this isn't a BCF file
        int nRead = 0;
        while ( nRead < magicBytes.length ) {
            final int result = stream.read(magicBytes, nRead, magicBytes.length - nRead);
            if ( result == -1 )
                return null;
            nRead += result;
        }

        if ( ! Arrays.equals(magicBytes, MAGIC_HEADER_START) )
            return null;

        // we're a BCF file: the two bytes after the magic are major then minor version
        final int majorByte = stream.read();
        final int minorByte = stream.read();
        if ( majorByte == -1 || minorByte == -1 )
            return null; // truncated header -> not a valid BCF file

        return new BCFVersion( majorByte, minorByte );
    }

    /**
     * Write out the BCF magic information indicating this is a BCF file with corresponding major and minor versions
     * @param out the stream to write the header to
     * @throws IOException on underlying stream errors
     */
    public void write(final OutputStream out) throws IOException {
        out.write(MAGIC_HEADER_START);
        out.write(getMajorVersion() & 0xFF);
        out.write(getMinorVersion() & 0xFF);
    }

    @Override
    public String toString() {
        return String.format("BCF%d.%d", getMajorVersion(), getMinorVersion());
    }
}
|
||||
|
|
@ -1,242 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Constants and utility methods used throughout the VCF/BCF/VariantContext classes
 */
public class GeneralUtils {

    /**
     * Setting this to true causes the VCF/BCF/VariantContext classes to emit debugging information
     * to standard error
     */
    public static final boolean DEBUG_MODE_ENABLED = false;

    /**
     * The smallest log10 value we'll emit from normalizeFromLog10 and other functions
     * where the real-space value is 0.0.
     */
    public final static double LOG10_P_OF_ZERO = -1000000.0;

    /**
     * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
     * elti objects (note there's no actual space between sep and the elti elements). Returns
     * "" if collection is empty. If collection contains just elt, then returns elt.toString()
     *
     * @param separator the string to use to separate objects
     * @param objects a collection of objects. the element order is defined by the iterator over objects
     * @param <T> the type of the objects
     * @return a non-null string
     */
    public static <T> String join(final String separator, final Collection<T> objects) {
        if (objects.isEmpty()) { // fast path for empty collection
            return "";
        } else {
            final Iterator<T> iter = objects.iterator();
            final T first = iter.next();

            if ( ! iter.hasNext() ) // fast path for singleton collections
                return first.toString();
            else { // full path for 2+ collection that actually need a join
                final StringBuilder ret = new StringBuilder(first.toString());
                while(iter.hasNext()) {
                    ret.append(separator);
                    ret.append(iter.next().toString());
                }
                return ret.toString();
            }
        }
    }

    /**
     * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
     *
     * @param array the array to be normalized
     * @return a newly allocated array corresponding the normalized values in array
     */
    public static double[] normalizeFromLog10(double[] array) {
        return normalizeFromLog10(array, false);
    }

    /**
     * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
     *
     * @param array the array to be normalized
     * @param takeLog10OfOutput if true, the output will be transformed back into log10 units
     * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed
     */
    public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) {
        return normalizeFromLog10(array, takeLog10OfOutput, false);
    }

    /**
     * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space
     *
     * WARNING: when keepInLogSpace is true the input array is modified in place and returned;
     * otherwise a newly allocated array is returned and the input is untouched.
     *
     * @param array the log10-space values to normalize
     * @param takeLog10OfOutput if true, convert the normalized values back to log10 units
     * @param keepInLogSpace if true, only subtract the max in log space (approximation; mutates array)
     * @return the normalized values (see warning above about allocation)
     */
    public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) {
        // for precision purposes, we need to add (or really subtract, since they're
        // all negative) the largest value; also, we need to convert to normal-space.
        double maxValue = arrayMax(array);

        // we may decide to just normalize in log space without converting to linear space
        if (keepInLogSpace) {
            for (int i = 0; i < array.length; i++) {
                array[i] -= maxValue;
            }
            return array;
        }

        // default case: go to linear space
        double[] normalized = new double[array.length];

        for (int i = 0; i < array.length; i++)
            normalized[i] = Math.pow(10, array[i] - maxValue);

        // normalize so the entries sum to 1 in linear space
        double sum = 0.0;
        for (int i = 0; i < array.length; i++)
            sum += normalized[i];
        for (int i = 0; i < array.length; i++) {
            double x = normalized[i] / sum;
            if (takeLog10OfOutput) {
                x = Math.log10(x);
                // clamp underflow/infinity back to the (max-relative) log10 value
                if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) )
                    x = array[i] - maxValue;
            }

            normalized[i] = x;
        }

        return normalized;
    }

    /**
     * @param array a non-null, non-empty array
     * @return the largest value in array
     */
    public static double arrayMax(final double[] array) {
        return array[maxElementIndex(array, array.length)];
    }

    /**
     * @param array a non-null, non-empty array
     * @return the index of the largest element (first occurrence on ties)
     */
    public static int maxElementIndex(final double[] array) {
        return maxElementIndex(array, array.length);
    }

    /**
     * Index of the largest element among the first endIndex entries of array.
     *
     * @param array a non-null, non-empty array
     * @param endIndex exclusive upper bound of the scan
     * @return the index of the largest element in [0, endIndex) (first occurrence on ties)
     * @throws IllegalArgumentException if array is null or empty
     */
    public static int maxElementIndex(final double[] array, final int endIndex) {
        if (array == null || array.length == 0)
            throw new IllegalArgumentException("Array cannot be null or empty!");

        int maxI = 0;
        for (int i = 1; i < endIndex; i++) {
            if (array[i] > array[maxI])
                maxI = i;
        }

        return maxI;
    }

    /**
     * Prepend elt to a copy of l.
     *
     * @param elt the new head element
     * @param l the tail (may be null, treated as empty)
     * @return a newly allocated list [elt, l...]
     */
    public static <T> List<T> cons(final T elt, final List<T> l) {
        List<T> l2 = new ArrayList<T>();
        l2.add(elt);
        if (l != null) l2.addAll(l);
        return l2;
    }

    /**
     * Make all combinations of N size of objects
     *
     * if objects = [A, B, C]
     * if N = 1 => [[A], [B], [C]]
     * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]]
     *
     * @param objects the candidate elements
     * @param n the length of each permutation; n <= 0 yields the empty list
     * @param <T> the element type
     * @param withReplacement if false, the resulting permutations will only contain unique objects from objects
     * @return a newly allocated list of permutations
     */
    public static <T> List<List<T>> makePermutations(final List<T> objects, final int n, final boolean withReplacement) {
        final List<List<T>> combinations = new ArrayList<List<T>>();

        if ( n == 1 ) {
            for ( final T o : objects )
                combinations.add(Collections.singletonList(o));
        } else if ( n > 1 ) {
            // extend every (n-1)-permutation by each allowed element
            final List<List<T>> sub = makePermutations(objects, n - 1, withReplacement);
            for ( final List<T> subI : sub ) {
                for ( final T a : objects ) {
                    if ( withReplacement || ! subI.contains(a) )
                        combinations.add(cons(a, subI));
                }
            }
        }
        // n <= 0 falls through and returns the empty list

        return combinations;
    }

    /**
     * Compares double values for equality (within 1e-6), or inequality.
     *
     * @param a the first double value
     * @param b the second double value
     * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a.
     */
    public static byte compareDoubles(double a, double b) {
        return compareDoubles(a, b, 1e-6);
    }

    /**
     * Compares double values for equality (within epsilon), or inequality.
     *
     * Note the sign convention is inverted relative to Comparable: -1 means a > b.
     *
     * @param a the first double value
     * @param b the second double value
     * @param epsilon the precision within which two double values will be considered equal
     * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a.
     */
    public static byte compareDoubles(double a, double b, double epsilon) {
        if (Math.abs(a - b) < epsilon) {
            return 0;
        }
        if (a > b) {
            return -1;
        }
        return 1;
    }

    /**
     * @param l the list to reverse (unmodified)
     * @return a newly allocated copy of l in reverse order
     */
    static public final <T> List<T> reverse(final List<T> l) {
        final List<T> newL = new ArrayList<T>(l);
        Collections.reverse(newL);
        return newL;
    }
}
|
||||
|
||||
|
||||
|
|
@ -1,476 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Immutable representation of an allele
|
||||
*
|
||||
* Types of alleles:
|
||||
*
|
||||
* Ref: a t C g a // C is the reference base
|
||||
*
|
||||
* : a t G g a // C base is a G in some individuals
|
||||
*
|
||||
* : a t - g a // C base is deleted w.r.t. the reference
|
||||
*
|
||||
* : a t CAg a // A base is inserted w.r.t. the reference sequence
|
||||
*
|
||||
* In these cases, where are the alleles?
|
||||
*
|
||||
* SNP polymorphism of C/G -> { C , G } -> C is the reference allele
|
||||
* 1 base deletion of C -> { C , - } -> C is the reference allele
|
||||
* 1 base insertion of A -> { - ; A } -> Null is the reference allele
|
||||
*
|
||||
* Suppose I see a the following in the population:
|
||||
*
|
||||
* Ref: a t C g a // C is the reference base
|
||||
* : a t G g a // C base is a G in some individuals
|
||||
* : a t - g a // C base is deleted w.r.t. the reference
|
||||
*
|
||||
* How do I represent this? There are three segregating alleles:
|
||||
*
|
||||
* { C , G , - }
|
||||
*
|
||||
* Now suppose I have this more complex example:
|
||||
*
|
||||
* Ref: a t C g a // C is the reference base
|
||||
* : a t - g a
|
||||
* : a t - - a
|
||||
* : a t CAg a
|
||||
*
|
||||
* There are actually four segregating alleles:
|
||||
*
|
||||
* { C g , - g, - -, and CAg } over bases 2-4
|
||||
*
|
||||
* However, the molecular equivalence explicitly listed above is usually discarded, so the actual
|
||||
* segregating alleles are:
|
||||
*
|
||||
* { C g, g, -, C a g }
|
||||
*
|
||||
* Critically, it should be possible to apply an allele to a reference sequence to create the
|
||||
* correct haplotype sequence:
|
||||
*
|
||||
* Allele + reference => haplotype
|
||||
*
|
||||
* For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the
|
||||
* Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context.
|
||||
*
|
||||
* Given list of alleles it's possible to determine the "type" of the variation
|
||||
*
|
||||
* A / C @ loc => SNP with
|
||||
* - / A => INDEL
|
||||
*
|
||||
* If you know where allele is the reference, you can determine whether the variant is an insertion or deletion.
|
||||
*
|
||||
* Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be
|
||||
* determined. This is usually represented by a '.' allele.
|
||||
*
|
||||
* Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an
|
||||
* Allele.
|
||||
|
||||
* @author ebanks, depristo
|
||||
*/
|
||||
public class Allele implements Comparable<Allele> {
    /** Shared zero-length base array used for NO_CALL alleles (and returned by getBases() for symbolic alleles). */
    private static final byte[] EMPTY_ALLELE_BASES = new byte[0];

    private boolean isRef = false;      // true iff this allele is tagged as the reference allele
    private boolean isNoCall = false;   // true iff this is the NO_CALL ('.') allele
    private boolean isSymbolic = false; // true iff this is a symbolic allele (e.g. <DEL>) or a breakend ('['/']')

    // bases segregating in this allele, stored UPPER CASE; empty for NO_CALL,
    // the raw tag text (including angle brackets) for symbolic alleles
    private byte[] bases = null;

    /** The string representation of the NO_CALL allele. */
    public final static String NO_CALL_STRING = ".";
    /** A generic static NO_CALL allele for use */

    // no public way to create an allele; use the static create() factories below
    protected Allele(byte[] bases, boolean isRef) {
        // null alleles are no longer allowed
        if ( wouldBeNullAllele(bases) ) {
            throw new IllegalArgumentException("Null alleles are not supported");
        }

        // no-calls are represented as no bases
        if ( wouldBeNoCallAllele(bases) ) {
            this.bases = EMPTY_ALLELE_BASES;
            isNoCall = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
            return;
        }

        if ( wouldBeSymbolicAllele(bases) ) {
            isSymbolic = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele");
        }
        else {
            // NOTE(review): upper-cases the CALLER'S array in place -- callers must not
            // rely on the passed-in array keeping its original case
            StringUtil.toUpperCase(bases);
        }

        this.isRef = isRef;
        this.bases = bases;

        if ( ! acceptableAlleleBases(bases) )
            throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
    }

    /** Convenience constructor delegating to the byte[] form. */
    protected Allele(String bases, boolean isRef) {
        this(bases.getBytes(), isRef);
    }

    // Interned single-base alleles handed out by create(byte[], boolean) so the
    // common SNP case does not allocate a fresh object per call.
    private final static Allele REF_A = new Allele("A", true);
    private final static Allele ALT_A = new Allele("A", false);
    private final static Allele REF_C = new Allele("C", true);
    private final static Allele ALT_C = new Allele("C", false);
    private final static Allele REF_G = new Allele("G", true);
    private final static Allele ALT_G = new Allele("G", false);
    private final static Allele REF_T = new Allele("T", true);
    private final static Allele ALT_T = new Allele("T", false);
    private final static Allele REF_N = new Allele("N", true);
    private final static Allele ALT_N = new Allele("N", false);
    /** The singleton NO_CALL allele. */
    public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false);

    // ---------------------------------------------------------------------------------------------------------
    //
    // creation routines
    //
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases
     * == '-', a Null allele is created. If bases == '.', a no call Allele is created.
     *
     * @param bases the DNA sequence of this variation, '-', of '.'
     * @param isRef should we make this a reference allele?
     * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated
     */
    public static Allele create(byte[] bases, boolean isRef) {
        if ( bases == null )
            throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");

        if ( bases.length == 1 ) {
            // optimization to return a static constant Allele for each single base object
            switch (bases[0]) {
                case '.':
                    if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
                    return NO_CALL;
                case 'A': case 'a' : return isRef ? REF_A : ALT_A;
                case 'C': case 'c' : return isRef ? REF_C : ALT_C;
                case 'G': case 'g' : return isRef ? REF_G : ALT_G;
                case 'T': case 't' : return isRef ? REF_T : ALT_T;
                case 'N': case 'n' : return isRef ? REF_N : ALT_N;
                default: throw new IllegalArgumentException("Illegal base [" + (char)bases[0] + "] seen in the allele");
            }
        } else {
            return new Allele(bases, isRef);
        }
    }

    /** Creates an allele from a single base; @see #create(byte[], boolean). */
    public static Allele create(byte base, boolean isRef) {
        // public Allele(byte base, boolean isRef) {
        return create( new byte[]{ base }, isRef);
    }

    /** Creates a non-reference allele from a single base. */
    public static Allele create(byte base) {
        return create( base, false );
    }

    /**
     * Returns a new allele whose bases are left's bases followed by right, preserving
     * left's reference status. Symbolic alleles cannot be extended.
     */
    public static Allele extend(Allele left, byte[] right) {
        if (left.isSymbolic())
            throw new IllegalArgumentException("Cannot extend a symbolic allele");
        byte[] bases = new byte[left.length() + right.length];
        System.arraycopy(left.getBases(), 0, bases, 0, left.length());
        System.arraycopy(right, 0, bases, left.length(), right.length);

        return create(bases, left.isReference());
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the null allele (empty, or the single char '-')
     */
    public static boolean wouldBeNullAllele(byte[] bases) {
        return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the NO_CALL allele (the single char '.')
     */
    public static boolean wouldBeNoCallAllele(byte[] bases) {
        return bases.length == 1 && bases[0] == '.';
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent a symbolic allele: either bracketed like
     *         &lt;TAG&gt;, or containing '[' / ']' (breakend notation)
     */
    public static boolean wouldBeSymbolicAllele(byte[] bases) {
        if ( bases.length <= 2 )
            return false;
        else {
            final String strBases = new String(bases);
            return (bases[0] == '<' && bases[bases.length-1] == '>') ||
                    (strBases.contains("[") || strBases.contains("]"));
        }
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the well formatted allele (N bases allowed)
     */
    public static boolean acceptableAlleleBases(String bases) {
        return acceptableAlleleBases(bases.getBytes(), true);
    }

    /** String overload of {@link #acceptableAlleleBases(byte[], boolean)}. */
    public static boolean acceptableAlleleBases(String bases, boolean allowNsAsAcceptable) {
        return acceptableAlleleBases(bases.getBytes(), allowNsAsAcceptable);
    }

    /**
     * @param bases  bases representing an allele
     * @return true if the bases represent the well formatted allele
     */
    public static boolean acceptableAlleleBases(byte[] bases) {
        return acceptableAlleleBases(bases, true); // default: N bases are acceptable
    }

    /**
     * Core validity check: rejects null alleles, accepts NO_CALL and symbolic alleles,
     * otherwise requires every base to be A/C/G/T (case-insensitive), with N optionally allowed.
     */
    public static boolean acceptableAlleleBases(byte[] bases, boolean allowNsAsAcceptable) {
        if ( wouldBeNullAllele(bases) )
            return false;

        if ( wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) )
            return true;

        for (byte base : bases ) {
            switch (base) {
                case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't':
                    break;
                case 'N' : case 'n' :
                    if (allowNsAsAcceptable)
                        break;
                    else
                        return false;
                default:
                    return false;
            }
        }

        return true;
    }

    /**
     * @see #create(byte[], boolean)
     *
     * @param bases  bases representing an allele
     * @param isRef  is this the reference allele?
     */
    public static Allele create(String bases, boolean isRef) {
        //public Allele(String bases, boolean isRef) {
        return create(bases.getBytes(), isRef);
    }


    /**
     * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
     *
     * @param bases  bases representing an allele
     */
    public static Allele create(String bases) {
        return create(bases, false);
    }

    /**
     * Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
     *
     * @param bases  bases representing an allele
     */
    public static Allele create(byte[] bases) {
        return create(bases, false);
        //this(bases, false);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // accessor routines
    //
    // ---------------------------------------------------------------------------------------------------------

    // Returns true if this is the NO_CALL allele
    public boolean isNoCall() { return isNoCall; }
    // Returns true if this is not the NO_CALL allele
    public boolean isCalled() { return ! isNoCall(); }

    // Returns true if this Allele is the reference allele
    public boolean isReference() { return isRef; }
    // Returns true if this Allele is not the reference allele
    public boolean isNonReference() { return ! isReference(); }

    // Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
    public boolean isSymbolic() { return isSymbolic; }

    // Returns a nice string representation of this object; reference alleles are marked with '*'
    public String toString() {
        return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : "");
    }

    /**
     * Return the DNA bases segregating in this allele. Note this isn't reference polarized,
     * so the Null allele is represented by a vector of length 0
     *
     * NOTE(review): returns the INTERNAL array for non-symbolic alleles (no defensive copy);
     * callers must not mutate it.
     *
     * @return the segregating bases
     */
    public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; }

    /**
     * Return the DNA bases segregating in this allele in String format.
     * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it.
     *
     * @return the segregating bases
     */
    public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); }

    /**
     * Return the printed representation of this allele.
     * Same as getBaseString(), except for symbolic alleles.
     * For symbolic alleles, the base string is empty while the display string contains <TAG>.
     *
     * @return the allele string representation
     */
    public String getDisplayString() { return new String(bases); }

    /**
     * Same as #getDisplayString() but returns the result as byte[].
     *
     * Slightly faster then getDisplayString()
     *
     * @return the allele string representation
     */
    public byte[] getDisplayBases() { return bases; }

    /**
     * @param other  the other allele
     *
     * @return true if these alleles are equal (same bases, same isNoCall flag, same ref tag)
     */
    public boolean equals(Object other) {
        return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) );
    }

    /**
     * @return hash code; computed from bases only, so a ref and an alt allele with the
     *         same bases share a hash (legal, since unequal objects may collide)
     */
    public int hashCode() {
        int hash = 1;
        for (int i = 0; i < bases.length; i++)
            hash += (i+1) * bases[i];
        return hash;
    }

    /**
     * Returns true if this and other are equal.  If ignoreRefState is true, then doesn't require both alleles has the
     * same ref tag
     *
     * @param other            allele to compare to
     * @param ignoreRefState   if true, ignore ref state in comparison
     * @return true if this and other are equal
     */
    public boolean equals(Allele other, boolean ignoreRefState) {
        return this == other || (isRef == other.isRef || ignoreRefState) && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases));
    }

    /**
     * @param test  bases to test against
     *
     * @return  true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); }

    /**
     * @param test  bases to test against
     *
     * @return  true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }

    /**
     * @param test  allele to test against
     *
     * @return  true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }

    /**
     * @return the length of this allele.  Null and NO_CALL alleles have 0 length.
     */
    public int length() {
        return isSymbolic ? 0 : bases.length;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // useful static functions
    //
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Linear scan of allAlleles for one whose bases match alleleBases; falls back to
     * NO_CALL for '.' input, or null if nothing matches.
     */
    public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) {
        for ( Allele a : allAlleles ) {
            if ( a.basesMatch(alleleBases) ) {
                return a;
            }
        }

        if ( wouldBeNoCallAllele(alleleBases) )
            return NO_CALL;
        else
            return null; // couldn't find anything
    }

    /** Orders reference alleles before non-reference; otherwise lexicographically by base string. */
    public int compareTo(Allele other) {
        if ( isReference() && other.isNonReference() )
            return -1;
        else if ( isNonReference() && other.isReference() )
            return 1;
        else
            return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue
    }

    /** @return true if the shorter of a1/a2 is a prefix of the longer. */
    public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) {
        if ( a2.length() >= a1.length() )
            return firstIsPrefixOfSecond(a1, a2);
        else
            return firstIsPrefixOfSecond(a2, a1);
    }

    // precondition (guaranteed by oneIsPrefixOfOther): a2 is at least as long as a1
    private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) {
        String a1String = a1.getBaseString();
        return a2.getBaseString().substring(0, a1String.length()).equals(a1String);
    }
}
|
||||
|
|
@ -1,263 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Common utility routines for VariantContext and Genotype
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public final class CommonInfo {
|
||||
public static final double NO_LOG10_PERROR = 1.0;
|
||||
|
||||
private static Set<String> NO_FILTERS = Collections.emptySet();
|
||||
private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());
|
||||
|
||||
private double log10PError = NO_LOG10_PERROR;
|
||||
private String name = null;
|
||||
private Set<String> filters = null;
|
||||
private Map<String, Object> attributes = NO_ATTRIBUTES;
|
||||
|
||||
public CommonInfo(String name, double log10PError, Set<String> filters, Map<String, Object> attributes) {
|
||||
this.name = name;
|
||||
setLog10PError(log10PError);
|
||||
this.filters = filters;
|
||||
if ( attributes != null && ! attributes.isEmpty() ) {
|
||||
this.attributes = attributes;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the name
|
||||
*/
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the name
|
||||
*
|
||||
* @param name the name associated with this information
|
||||
*/
|
||||
public void setName(String name) {
|
||||
if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this);
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Filter
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Set<String> getFiltersMaybeNull() {
|
||||
return filters;
|
||||
}
|
||||
|
||||
public Set<String> getFilters() {
|
||||
return filters == null ? NO_FILTERS : Collections.unmodifiableSet(filters);
|
||||
}
|
||||
|
||||
public boolean filtersWereApplied() {
|
||||
return filters != null;
|
||||
}
|
||||
|
||||
public boolean isFiltered() {
|
||||
return filters == null ? false : filters.size() > 0;
|
||||
}
|
||||
|
||||
public boolean isNotFiltered() {
|
||||
return ! isFiltered();
|
||||
}
|
||||
|
||||
public void addFilter(String filter) {
|
||||
if ( filters == null ) // immutable -> mutable
|
||||
filters = new HashSet<String>();
|
||||
|
||||
if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
|
||||
if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
|
||||
filters.add(filter);
|
||||
}
|
||||
|
||||
public void addFilters(Collection<String> filters) {
|
||||
if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this);
|
||||
for ( String f : filters )
|
||||
addFilter(f);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Working with log error rates
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean hasLog10PError() {
|
||||
return getLog10PError() != NO_LOG10_PERROR;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the -1 * log10-based error estimate
|
||||
*/
|
||||
public double getLog10PError() { return log10PError; }
|
||||
public double getPhredScaledQual() { return getLog10PError() * -10; }
|
||||
|
||||
public void setLog10PError(double log10PError) {
|
||||
if ( log10PError > 0 && log10PError != NO_LOG10_PERROR)
|
||||
throw new IllegalArgumentException("BUG: log10PError cannot be > 0 : " + this.log10PError);
|
||||
if ( Double.isInfinite(this.log10PError) )
|
||||
throw new IllegalArgumentException("BUG: log10PError should not be Infinity");
|
||||
if ( Double.isNaN(this.log10PError) )
|
||||
throw new IllegalArgumentException("BUG: log10PError should not be NaN");
|
||||
this.log10PError = log10PError;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Working with attributes
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
public void clearAttributes() {
|
||||
attributes = new HashMap<String, Object>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the attribute map
|
||||
*/
|
||||
public Map<String, Object> getAttributes() {
|
||||
return Collections.unmodifiableMap(attributes);
|
||||
}
|
||||
|
||||
// todo -- define common attributes as enum
|
||||
|
||||
public void setAttributes(Map<String, ?> map) {
|
||||
clearAttributes();
|
||||
putAttributes(map);
|
||||
}
|
||||
|
||||
public void putAttribute(String key, Object value) {
|
||||
putAttribute(key, value, false);
|
||||
}
|
||||
|
||||
public void putAttribute(String key, Object value, boolean allowOverwrites) {
|
||||
if ( ! allowOverwrites && hasAttribute(key) )
|
||||
throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);
|
||||
|
||||
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||
attributes = new HashMap<String, Object>();
|
||||
|
||||
attributes.put(key, value);
|
||||
}
|
||||
|
||||
public void removeAttribute(String key) {
|
||||
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||
attributes = new HashMap<String, Object>();
|
||||
attributes.remove(key);
|
||||
}
|
||||
|
||||
public void putAttributes(Map<String, ?> map) {
|
||||
if ( map != null ) {
|
||||
// for efficiency, we can skip the validation if the map is empty
|
||||
if ( attributes.size() == 0 ) {
|
||||
if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
|
||||
attributes = new HashMap<String, Object>();
|
||||
attributes.putAll(map);
|
||||
} else {
|
||||
for ( Map.Entry<String, ?> elt : map.entrySet() ) {
|
||||
putAttribute(elt.getKey(), elt.getValue(), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasAttribute(String key) {
|
||||
return attributes.containsKey(key);
|
||||
}
|
||||
|
||||
public int getNumAttributes() {
|
||||
return attributes.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the attribute key
|
||||
*
|
||||
* @return the attribute value for the given key (or null if not set)
|
||||
*/
|
||||
public Object getAttribute(String key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
||||
public Object getAttribute(String key, Object defaultValue) {
|
||||
if ( hasAttribute(key) )
|
||||
return attributes.get(key);
|
||||
else
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
public String getAttributeAsString(String key, String defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof String ) return (String)x;
|
||||
return String.valueOf(x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public int getAttributeAsInt(String key, int defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
|
||||
if ( x instanceof Integer ) return (Integer)x;
|
||||
return Integer.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public double getAttributeAsDouble(String key, double defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof Double ) return (Double)x;
|
||||
if ( x instanceof Integer ) return (Integer)x;
|
||||
return Double.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof Boolean ) return (Boolean)x;
|
||||
return Boolean.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
// public String getAttributeAsString(String key) { return (String.valueOf(getExtendedAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
|
||||
// public int getAttributeAsInt(String key) { Object x = getExtendedAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
|
||||
// public double getAttributeAsDouble(String key) { Object x = getExtendedAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
|
||||
// public boolean getAttributeAsBoolean(String key) { Object x = getExtendedAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
|
||||
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
|
||||
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
|
||||
// public String getAttributeAsStringNoException(String key) { if (getExtendedAttribute(key) == null) return null; return getAttributeAsString(key); }
|
||||
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
|
||||
}
|
||||
|
|
@ -1,182 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class encompasses all the basic information about a genotype. It is immutable.
|
||||
*
|
||||
* A genotype has several key fields
|
||||
*
|
||||
* -- a sample name, must be a non-null string
|
||||
*
|
||||
* -- an ordered list of alleles, interpreted as the genotype of the sample,
|
||||
* each allele for each chromosome given in order. If alleles = [a*, t]
|
||||
* then the sample is a/t, with a (the reference from the *) the first
|
||||
* chromosome and t on the second chromosome
|
||||
*
|
||||
* -- a isPhased marker indicting where the alleles are phased with respect to some global
|
||||
* coordinate system. See VCF4.1 spec for a detailed discussion
|
||||
*
|
||||
* -- Inline, optimized ints and int[] values for:
|
||||
* -- GQ: the phred-scaled genotype quality, of -1 if it's missing
|
||||
*
|
||||
* -- DP: the count of reads at this locus for this sample, of -1 if missing
|
||||
*
|
||||
* -- AD: an array of counts of reads at this locus, one for each Allele at the site.
|
||||
* that is, for each allele in the surrounding VariantContext. Null if missing.
|
||||
*
|
||||
* -- PL: phred-scaled genotype likelihoods in standard VCF4.1 order for
|
||||
* all combinations of the alleles in the surrounding VariantContext, given
|
||||
* the ploidy of the sample (from the alleles vector). Null if missing.
|
||||
*
|
||||
* -- A general map from String keys to -> Object values for all other attributes in
|
||||
* this genotype. Note that this map should not contain duplicate values for the
|
||||
* standard bindings for GQ, DP, AD, and PL. Genotype filters can be put into
|
||||
* this genotype, but it isn't respected by the GATK in analyses
|
||||
*
|
||||
* The only way to build a Genotype object is with a GenotypeBuilder, which permits values
|
||||
* to be set in any order, which means that GenotypeBuilder may at some in the chain of
|
||||
* sets pass through invalid states that are not permitted in a fully formed immutable
|
||||
* Genotype.
|
||||
*
|
||||
* Note this is a simplified, refactored Genotype object based on the original
|
||||
* generic (and slow) implementation from the original VariantContext + Genotype
|
||||
* codebase.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 05/12
|
||||
*/
|
||||
public final class FastGenotype extends Genotype {
|
||||
private final List<Allele> alleles;
|
||||
private final boolean isPhased;
|
||||
private final int GQ;
|
||||
private final int DP;
|
||||
private final int[] AD;
|
||||
private final int[] PL;
|
||||
private final Map<String, Object> extendedAttributes;
|
||||
|
||||
/**
|
||||
* The only way to make one of these, for use by GenotypeBuilder only
|
||||
*
|
||||
* @param sampleName
|
||||
* @param alleles
|
||||
* @param isPhased
|
||||
* @param GQ
|
||||
* @param DP
|
||||
* @param AD
|
||||
* @param PL
|
||||
* @param extendedAttributes
|
||||
*/
|
||||
@Requires({
|
||||
"sampleName != null",
|
||||
"alleles != null",
|
||||
"GQ >= -1",
|
||||
"DP >= -1",
|
||||
"validADorPLField(AD)",
|
||||
"validADorPLField(PL)",
|
||||
"extendedAttributes != null",
|
||||
"! hasForbiddenKey(extendedAttributes)"})
|
||||
protected FastGenotype(final String sampleName,
|
||||
final List<Allele> alleles,
|
||||
final boolean isPhased,
|
||||
final int GQ,
|
||||
final int DP,
|
||||
final int[] AD,
|
||||
final int[] PL,
|
||||
final String filters,
|
||||
final Map<String, Object> extendedAttributes) {
|
||||
super(sampleName, filters);
|
||||
this.alleles = alleles;
|
||||
this.isPhased = isPhased;
|
||||
this.GQ = GQ;
|
||||
this.DP = DP;
|
||||
this.AD = AD;
|
||||
this.PL = PL;
|
||||
this.extendedAttributes = extendedAttributes;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Implmenting the abstract methods
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override public List<Allele> getAlleles() {
|
||||
return alleles;
|
||||
}
|
||||
|
||||
@Override public Allele getAllele(int i) {
|
||||
return alleles.get(i);
|
||||
}
|
||||
|
||||
@Override public boolean isPhased() {
|
||||
return isPhased;
|
||||
}
|
||||
|
||||
@Override public int getDP() {
|
||||
return DP;
|
||||
}
|
||||
|
||||
@Override public int[] getAD() {
|
||||
return AD;
|
||||
}
|
||||
|
||||
@Override public int getGQ() {
|
||||
return GQ;
|
||||
}
|
||||
|
||||
@Override public int[] getPL() {
|
||||
return PL;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// get routines for extended attributes
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Map<String, Object> getExtendedAttributes() {
|
||||
return extendedAttributes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is values a valid AD or PL field
|
||||
* @param values
|
||||
* @return
|
||||
*/
|
||||
private static boolean validADorPLField(final int[] values) {
|
||||
if ( values != null )
|
||||
for ( int v : values )
|
||||
if ( v < 0 )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,676 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class encompasses all the basic information about a genotype. It is immutable.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
@Invariant({
|
||||
"getAlleles() != null",
|
||||
"getSampleName() != null",
|
||||
"getPloidy() >= 0",
|
||||
"! hasForbiddenKey(getExtendedAttributes())"})
|
||||
public abstract class Genotype implements Comparable<Genotype> {
|
||||
/**
|
||||
* A list of genotype field keys corresponding to values we
|
||||
* manage inline in the Genotype object. They must not appear in the
|
||||
* extended attributes map
|
||||
*/
|
||||
public final static Collection<String> PRIMARY_KEYS = Arrays.asList(
|
||||
VCFConstants.GENOTYPE_FILTER_KEY,
|
||||
VCFConstants.GENOTYPE_KEY,
|
||||
VCFConstants.GENOTYPE_QUALITY_KEY,
|
||||
VCFConstants.DEPTH_KEY,
|
||||
VCFConstants.GENOTYPE_ALLELE_DEPTHS,
|
||||
VCFConstants.GENOTYPE_PL_KEY);
|
||||
|
||||
public final static String PHASED_ALLELE_SEPARATOR = "|";
|
||||
public final static String UNPHASED_ALLELE_SEPARATOR = "/";
|
||||
|
||||
private final String sampleName;
|
||||
private GenotypeType type = null;
|
||||
private final String filters;
|
||||
|
||||
protected Genotype(final String sampleName, final String filters) {
|
||||
this.sampleName = sampleName;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the alleles for this genotype. Cannot be null. May be empty
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public abstract List<Allele> getAlleles();
|
||||
|
||||
/**
|
||||
* Returns how many times allele appears in this genotype object?
|
||||
*
|
||||
* @param allele
|
||||
* @return a value >= 0 indicating how many times the allele occurred in this sample's genotype
|
||||
*/
|
||||
@Requires("allele != null")
|
||||
@Ensures("result >= 0")
|
||||
public int countAllele(final Allele allele) {
|
||||
int c = 0;
|
||||
for ( final Allele a : getAlleles() )
|
||||
if ( a.equals(allele) )
|
||||
c++;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ith allele in this genotype
|
||||
*
|
||||
* @param i the ith allele, must be < the ploidy, starting with 0
|
||||
* @return the allele at position i, which cannot be null
|
||||
*/
|
||||
@Requires({"i >=0 && i < getPloidy()", "getType() != GenotypeType.UNAVAILABLE"})
|
||||
@Ensures("result != null")
|
||||
public abstract Allele getAllele(int i);
|
||||
|
||||
/**
|
||||
* Are the alleles phased w.r.t. the global phasing system?
|
||||
*
|
||||
* @return true if yes
|
||||
*/
|
||||
public abstract boolean isPhased();
|
||||
|
||||
/**
|
||||
* What is the ploidy of this sample?
|
||||
*
|
||||
* @return the ploidy of this genotype. 0 if the site is no-called.
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getPloidy() {
|
||||
return getAlleles().size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the sequencing depth of this sample, or -1 if this value is missing
|
||||
*/
|
||||
@Ensures("result >= -1")
|
||||
public abstract int getDP();
|
||||
|
||||
/**
|
||||
* @return the count of reads, one for each allele in the surrounding Variant context,
|
||||
* matching the corresponding allele, or null if this value is missing. MUST
|
||||
* NOT BE MODIFIED!
|
||||
*/
|
||||
public abstract int[] getAD();
|
||||
|
||||
/**
|
||||
* Returns the name associated with this sample.
|
||||
*
|
||||
* @return a non-null String
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public String getSampleName() {
|
||||
return sampleName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a phred-scaled quality score, or -1 if none is available
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= -1")
|
||||
public abstract int getGQ();
|
||||
|
||||
/**
|
||||
* Does the PL field have a value?
|
||||
* @return true if there's a PL field value
|
||||
*/
|
||||
@Ensures("(result == false && getPL() == null) || (result == true && getPL() != null)")
|
||||
public boolean hasPL() {
|
||||
return getPL() != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the AD field have a value?
|
||||
* @return true if there's a AD field value
|
||||
*/
|
||||
@Ensures("(result == false && getAD() == null) || (result == true && getAD() != null)")
|
||||
public boolean hasAD() {
|
||||
return getAD() != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the GQ field have a value?
|
||||
* @return true if there's a GQ field value
|
||||
*/
|
||||
@Ensures("(result == false && getGQ() == -1) || (result == true && getGQ() >= 0)")
|
||||
public boolean hasGQ() {
|
||||
return getGQ() != -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the DP field have a value?
|
||||
* @return true if there's a DP field value
|
||||
*/
|
||||
@Ensures("(result == false && getDP() == -1) || (result == true && getDP() >= 0)")
|
||||
public boolean hasDP() {
|
||||
return getDP() != -1;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// The type of this genotype
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @return the high-level type of this sample's genotype
|
||||
*/
|
||||
@Ensures({"type != null", "result != null"})
|
||||
public GenotypeType getType() {
|
||||
if ( type == null ) {
|
||||
type = determineType();
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal code to determine the type of the genotype from the alleles vector
|
||||
* @return the type
|
||||
*/
|
||||
@Requires("type == null") // we should never call if already calculated
|
||||
protected GenotypeType determineType() {
|
||||
// TODO -- this code is slow and could be optimized for the diploid case
|
||||
final List<Allele> alleles = getAlleles();
|
||||
if ( alleles.isEmpty() )
|
||||
return GenotypeType.UNAVAILABLE;
|
||||
|
||||
boolean sawNoCall = false, sawMultipleAlleles = false;
|
||||
Allele observedAllele = null;
|
||||
|
||||
for ( final Allele allele : alleles ) {
|
||||
if ( allele.isNoCall() )
|
||||
sawNoCall = true;
|
||||
else if ( observedAllele == null )
|
||||
observedAllele = allele;
|
||||
else if ( !allele.equals(observedAllele) )
|
||||
sawMultipleAlleles = true;
|
||||
}
|
||||
|
||||
if ( sawNoCall ) {
|
||||
if ( observedAllele == null )
|
||||
return GenotypeType.NO_CALL;
|
||||
return GenotypeType.MIXED;
|
||||
}
|
||||
|
||||
if ( observedAllele == null )
|
||||
throw new IllegalStateException("BUG: there are no alleles present in this genotype but the alleles list is not null");
|
||||
|
||||
return sawMultipleAlleles ? GenotypeType.HET : observedAllele.isReference() ? GenotypeType.HOM_REF : GenotypeType.HOM_VAR;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false.
|
||||
*/
|
||||
public boolean isHom() { return isHomRef() || isHomVar(); }
|
||||
|
||||
/**
|
||||
* @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false.
|
||||
*/
|
||||
public boolean isHomRef() { return getType() == GenotypeType.HOM_REF; }
|
||||
|
||||
/**
|
||||
* @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false.
|
||||
*/
|
||||
public boolean isHomVar() { return getType() == GenotypeType.HOM_VAR; }
|
||||
|
||||
/**
|
||||
* @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false.
|
||||
*/
|
||||
public boolean isHet() { return getType() == GenotypeType.HET; }
|
||||
|
||||
/**
|
||||
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false.
|
||||
*/
|
||||
public boolean isNoCall() { return getType() == GenotypeType.NO_CALL; }
|
||||
|
||||
/**
|
||||
* @return true if this genotype is comprised of any alleles that are not no-calls (even if some are).
|
||||
*/
|
||||
public boolean isCalled() { return getType() != GenotypeType.NO_CALL && getType() != GenotypeType.UNAVAILABLE; }
|
||||
|
||||
/**
|
||||
* @return true if this genotype is comprised of both calls and no-calls.
|
||||
*/
|
||||
public boolean isMixed() { return getType() == GenotypeType.MIXED; }
|
||||
|
||||
/**
|
||||
* @return true if the type of this genotype is set.
|
||||
*/
|
||||
public boolean isAvailable() { return getType() != GenotypeType.UNAVAILABLE; }
|
||||
|
||||
// ------------------------------------------------------------------------------
|
||||
//
|
||||
// methods for getting genotype likelihoods for a genotype object, if present
|
||||
//
|
||||
// ------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @return Returns true if this Genotype has PL field values
|
||||
*/
|
||||
@Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)")
|
||||
public boolean hasLikelihoods() {
|
||||
return getPL() != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that returns a string representation of the PL field of this
|
||||
* genotype, or . if none is available.
|
||||
*
|
||||
* @return a non-null String representation for the PL of this sample
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public String getLikelihoodsString() {
|
||||
return hasLikelihoods() ? getLikelihoods().toString() : VCFConstants.MISSING_VALUE_v4;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing
|
||||
* @return null or a GenotypesLikelihood object for this sample's PL field
|
||||
*/
|
||||
@Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)")
|
||||
public GenotypeLikelihoods getLikelihoods() {
|
||||
return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are all likelihoods for this sample non-informative?
|
||||
*
|
||||
* Returns true if all PLs are 0 => 0,0,0 => true
|
||||
* 0,0,0,0,0,0 => true
|
||||
* 0,10,100 => false
|
||||
*
|
||||
* @return true if all samples PLs are equal and == 0
|
||||
*/
|
||||
public boolean isNonInformative() {
|
||||
if ( getPL() == null )
|
||||
return true;
|
||||
else {
|
||||
for ( final int PL : getPL() ) {
|
||||
if ( PL != 0 )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsafe low-level accessor the PL field itself, may be null.
|
||||
*
|
||||
* @return a pointer to the underlying PL data. MUST NOT BE MODIFIED!
|
||||
*/
|
||||
public abstract int[] getPL();
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Many different string representations
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return a VCF-like string representation for the alleles of this genotype.
|
||||
*
|
||||
* Does not append the reference * marker on the alleles.
|
||||
*
|
||||
* @return a string representing the genotypes, or null if the type is unavailable.
|
||||
*/
|
||||
@Ensures("result != null || ! isAvailable()")
|
||||
public String getGenotypeString() {
|
||||
return getGenotypeString(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a VCF-like string representation for the alleles of this genotype.
|
||||
*
|
||||
* If ignoreRefState is true, will not append the reference * marker on the alleles.
|
||||
*
|
||||
* @return a string representing the genotypes, or null if the type is unavailable.
|
||||
*/
|
||||
@Ensures("result != null || ! isAvailable()")
|
||||
public String getGenotypeString(boolean ignoreRefState) {
|
||||
if ( getPloidy() == 0 )
|
||||
return "NA";
|
||||
|
||||
// Notes:
|
||||
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
|
||||
// 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)
|
||||
// 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course)
|
||||
return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR,
|
||||
ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles())));
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility that returns a list of allele strings corresponding to the alleles in this sample
|
||||
* @return
|
||||
*/
|
||||
protected List<String> getAlleleStrings() {
|
||||
final List<String> al = new ArrayList<String>(getPloidy());
|
||||
for ( Allele a : getAlleles() )
|
||||
al.add(a.getBaseString());
|
||||
|
||||
return al;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("[%s %s%s%s%s%s%s%s]",
|
||||
getSampleName(),
|
||||
getGenotypeString(false),
|
||||
toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()),
|
||||
toStringIfExists(VCFConstants.DEPTH_KEY, getDP()),
|
||||
toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()),
|
||||
toStringIfExists(VCFConstants.GENOTYPE_PL_KEY, getPL()),
|
||||
toStringIfExists(VCFConstants.GENOTYPE_FILTER_KEY, getFilters()),
|
||||
sortedString(getExtendedAttributes()));
|
||||
}
|
||||
|
||||
public String toBriefString() {
|
||||
return String.format("%s:Q%d", getGenotypeString(false), getGQ());
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Comparison operations
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* comparable genotypes -> compareTo on the sample names
|
||||
* @param genotype
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public int compareTo(final Genotype genotype) {
|
||||
return getSampleName().compareTo(genotype.getSampleName());
|
||||
}
|
||||
|
||||
public boolean sameGenotype(final Genotype other) {
|
||||
return sameGenotype(other, true);
|
||||
}
|
||||
|
||||
public boolean sameGenotype(final Genotype other, boolean ignorePhase) {
|
||||
if (getPloidy() != other.getPloidy())
|
||||
return false; // gotta have the same number of allele to be equal
|
||||
|
||||
// By default, compare the elements in the lists of alleles, element-by-element
|
||||
Collection<Allele> thisAlleles = this.getAlleles();
|
||||
Collection<Allele> otherAlleles = other.getAlleles();
|
||||
|
||||
if (ignorePhase) { // do not care about order, only identity of Alleles
|
||||
thisAlleles = new TreeSet<Allele>(thisAlleles); //implemented Allele.compareTo()
|
||||
otherAlleles = new TreeSet<Allele>(otherAlleles);
|
||||
}
|
||||
|
||||
return thisAlleles.equals(otherAlleles);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// get routines for extended attributes
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the extended attributes for this object
|
||||
* @return is never null, but is often isEmpty()
|
||||
*/
|
||||
@Ensures({"result != null", "! hasForbiddenKey(result)"})
|
||||
public abstract Map<String, Object> getExtendedAttributes();
|
||||
|
||||
/**
|
||||
* Is key associated with a value (even a null one) in the extended attributes?
|
||||
*
|
||||
* Note this will not return true for the inline attributes DP, GQ, AD, or PL
|
||||
*
|
||||
* @param key a non-null string key to check for an association
|
||||
* @return true if key has a value in the extendedAttributes
|
||||
*/
|
||||
@Requires({"key != null", "! isForbiddenKey(key)"})
|
||||
public boolean hasExtendedAttribute(final String key) {
|
||||
return getExtendedAttributes().containsKey(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the extended attribute value associated with key, if possible
|
||||
*
|
||||
* @param key a non-null string key to fetch a value for
|
||||
* @param defaultValue the value to return if key isn't in the extended attributes
|
||||
* @return a value (potentially) null associated with key, or defaultValue if no association exists
|
||||
*/
|
||||
@Requires({"key != null", "! isForbiddenKey(key)"})
|
||||
@Ensures("hasExtendedAttribute(key) || result == defaultValue")
|
||||
public Object getExtendedAttribute(final String key, final Object defaultValue) {
|
||||
return hasExtendedAttribute(key) ? getExtendedAttributes().get(key) : defaultValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as #getExtendedAttribute with a null default
|
||||
*
|
||||
* @param key
|
||||
* @return
|
||||
*/
|
||||
public Object getExtendedAttribute(final String key) {
|
||||
return getExtendedAttribute(key, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the filter string associated with this Genotype.
|
||||
*
|
||||
* @return If this result == null, then the genotype is considered PASSing filters
|
||||
* If the result != null, then the genotype has failed filtering for the reason(s)
|
||||
* specified in result. To be reference compliant multiple filter field
|
||||
* string values can be encoded with a ; separator.
|
||||
*/
|
||||
public final String getFilters() {
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this genotype filtered or not?
|
||||
*
|
||||
* @return returns false if getFilters() == null
|
||||
*/
|
||||
@Ensures({"result != (getFilters() == null)"})
|
||||
public final boolean isFiltered() {
|
||||
return getFilters() != null;
|
||||
}
|
||||
|
||||
@Deprecated public boolean hasLog10PError() { return hasGQ(); }
|
||||
@Deprecated public double getLog10PError() { return getGQ() / -10.0; }
|
||||
@Deprecated public int getPhredScaledQual() { return getGQ(); }
|
||||
|
||||
@Deprecated
|
||||
public String getAttributeAsString(String key, String defaultValue) {
|
||||
Object x = getExtendedAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof String ) return (String)x;
|
||||
return String.valueOf(x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public int getAttributeAsInt(String key, int defaultValue) {
|
||||
Object x = getExtendedAttribute(key);
|
||||
if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
|
||||
if ( x instanceof Integer ) return (Integer)x;
|
||||
return Integer.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public double getAttributeAsDouble(String key, double defaultValue) {
|
||||
Object x = getExtendedAttribute(key);
|
||||
if ( x == null ) return defaultValue;
|
||||
if ( x instanceof Double ) return (Double)x;
|
||||
return Double.valueOf((String)x); // throws an exception if this isn't a string
|
||||
}
|
||||
|
||||
/**
|
||||
* A totally generic getter, that allows you to specific keys that correspond
|
||||
* to even inline values (GQ, for example). Can be very expensive. Additionally,
|
||||
* all int[] are converted inline into List<Integer> for convenience.
|
||||
*
|
||||
* @param key
|
||||
* @return
|
||||
*/
|
||||
public Object getAnyAttribute(final String key) {
|
||||
if (key.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||
return getAlleles();
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||
return getGQ();
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
|
||||
return Arrays.asList(getAD());
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_PL_KEY)) {
|
||||
return Arrays.asList(getPL());
|
||||
} else if (key.equals(VCFConstants.DEPTH_KEY)) {
|
||||
return getDP();
|
||||
} else {
|
||||
return getExtendedAttribute(key);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasAnyAttribute(final String key) {
|
||||
if (key.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||
return isAvailable();
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||
return hasGQ();
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
|
||||
return hasAD();
|
||||
} else if (key.equals(VCFConstants.GENOTYPE_PL_KEY)) {
|
||||
return hasPL();
|
||||
} else if (key.equals(VCFConstants.DEPTH_KEY)) {
|
||||
return hasDP();
|
||||
} else {
|
||||
return hasExtendedAttribute(key);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO -- add getAttributesAsX interface here
|
||||
|
||||
// ------------------------------------------------------------------------------
|
||||
//
|
||||
// private utilities
|
||||
//
|
||||
// ------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* a utility method for generating sorted strings from a map key set.
|
||||
* @param c the map
|
||||
* @param <T> the key type
|
||||
* @param <V> the value type
|
||||
* @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys
|
||||
*/
|
||||
@Requires("c != null")
|
||||
protected static <T extends Comparable<T>, V> String sortedString(Map<T, V> c) {
|
||||
|
||||
// NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS
|
||||
final List<T> t = new ArrayList<T>(c.keySet());
|
||||
Collections.sort(t);
|
||||
|
||||
final List<String> pairs = new ArrayList<String>();
|
||||
for (final T k : t) {
|
||||
pairs.add(k + "=" + c.get(k));
|
||||
}
|
||||
|
||||
return pairs.isEmpty() ? "" : " {" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a display name for field name with value v if this isn't -1. Otherwise returns ""
|
||||
* @param name of the field ("AD")
|
||||
* @param v the value of the field, or -1 if missing
|
||||
* @return a non-null string for display if the field is not missing
|
||||
*/
|
||||
@Requires("name != null")
|
||||
@Ensures("result != null")
|
||||
protected final static String toStringIfExists(final String name, final int v) {
|
||||
return v == -1 ? "" : " " + name + " " + v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a display name for field name with String value v if this isn't null. Otherwise returns ""
|
||||
* @param name of the field ("FT")
|
||||
* @param v the value of the field, or null if missing
|
||||
* @return a non-null string for display if the field is not missing
|
||||
*/
|
||||
protected final static String toStringIfExists(final String name, final String v) {
|
||||
return v == null ? "" : " " + name + " " + v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a display name for field name with values vs if this isn't null. Otherwise returns ""
|
||||
* @param name of the field ("AD")
|
||||
* @param vs the value of the field, or null if missing
|
||||
* @return a non-null string for display if the field is not missing
|
||||
*/
|
||||
@Requires("name != null")
|
||||
@Ensures("result != null")
|
||||
protected final static String toStringIfExists(final String name, final int[] vs) {
|
||||
if ( vs == null )
|
||||
return "";
|
||||
else {
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append(" ").append(name).append(" ");
|
||||
for ( int i = 0; i < vs.length; i++ ) {
|
||||
if ( i != 0 ) b.append(",");
|
||||
b.append(vs[i]);
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the attribute map have a mapping involving a forbidden key (i.e.,
|
||||
* one that's managed inline by this Genotypes object?
|
||||
*
|
||||
* @param attributes the extended attributes key
|
||||
* @return
|
||||
*/
|
||||
protected final static boolean hasForbiddenKey(final Map<String, Object> attributes) {
|
||||
for ( final String forbidden : PRIMARY_KEYS)
|
||||
if ( attributes.containsKey(forbidden) )
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
protected final static boolean isForbiddenKey(final String key) {
|
||||
return PRIMARY_KEYS.contains(key);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,419 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A builder class for genotypes
|
||||
*
|
||||
* Provides convenience setter methods for all of the Genotype field
|
||||
* values. Setter methods can be used in any order, allowing you to
|
||||
* pass through states that wouldn't be allowed in the highly regulated
|
||||
* immutable Genotype class.
|
||||
*
|
||||
* All fields default to meaningful MISSING values.
|
||||
*
|
||||
* Call make() to actually create the corresponding Genotype object from
|
||||
* this builder. Can be called multiple times to create independent copies,
|
||||
* or with intervening sets to conveniently make similar Genotypes with
|
||||
* slight modifications.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
@Invariant({"alleles != null"})
|
||||
public final class GenotypeBuilder {
|
||||
private static final List<Allele> HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL);
|
||||
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
||||
private String sampleName = null;
|
||||
private List<Allele> alleles = Collections.emptyList();
|
||||
|
||||
private boolean isPhased = false;
|
||||
private int GQ = -1;
|
||||
private int DP = -1;
|
||||
private int[] AD = null;
|
||||
private int[] PL = null;
|
||||
private Map<String, Object> extendedAttributes = null;
|
||||
private String filters = null;
|
||||
private int initialAttributeMapSize = 5;
|
||||
|
||||
private final static Map<String, Object> NO_ATTRIBUTES =
|
||||
Collections.unmodifiableMap(new HashMap<String, Object>(0));
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Factory methods
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
public static Genotype create(final String sampleName, final List<Allele> alleles) {
|
||||
return new GenotypeBuilder(sampleName, alleles).make();
|
||||
}
|
||||
|
||||
public static Genotype create(final String sampleName,
|
||||
final List<Allele> alleles,
|
||||
final Map<String, Object> attributes) {
|
||||
return new GenotypeBuilder(sampleName, alleles).attributes(attributes).make();
|
||||
}
|
||||
|
||||
protected static Genotype create(final String sampleName,
|
||||
final List<Allele> alleles,
|
||||
final double[] gls) {
|
||||
return new GenotypeBuilder(sampleName, alleles).PL(gls).make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Genotype object for a sample that's missing from the VC (i.e., in
|
||||
* the output header). Defaults to a diploid no call genotype ./.
|
||||
*
|
||||
* @param sampleName the name of this sample
|
||||
* @return an initialized Genotype with sampleName that's a diploid ./. no call genotype
|
||||
*/
|
||||
public static Genotype createMissing(final String sampleName, final int ploidy) {
|
||||
final GenotypeBuilder builder = new GenotypeBuilder(sampleName);
|
||||
switch ( ploidy ) {
|
||||
case 1: builder.alleles(HAPLOID_NO_CALL); break;
|
||||
case 2: builder.alleles(DIPLOID_NO_CALL); break;
|
||||
default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break;
|
||||
}
|
||||
return builder.make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a empty builder. Both a sampleName and alleles must be provided
|
||||
* before trying to make a Genotype from this builder.
|
||||
*/
|
||||
public GenotypeBuilder() {}
|
||||
|
||||
/**
|
||||
* Create a builder using sampleName. Alleles must be provided
|
||||
* before trying to make a Genotype from this builder.
|
||||
* @param sampleName
|
||||
*/
|
||||
public GenotypeBuilder(final String sampleName) {
|
||||
name(sampleName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a builder using sampleName and alleles for starting values
|
||||
* @param sampleName
|
||||
* @param alleles
|
||||
*/
|
||||
public GenotypeBuilder(final String sampleName, final List<Allele> alleles) {
|
||||
name(sampleName);
|
||||
alleles(alleles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new builder starting with the values in Genotype g
|
||||
* @param g
|
||||
*/
|
||||
public GenotypeBuilder(final Genotype g) {
|
||||
copy(g);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy all of the values for this builder from Genotype g
|
||||
* @param g
|
||||
* @return
|
||||
*/
|
||||
public GenotypeBuilder copy(final Genotype g) {
|
||||
name(g.getSampleName());
|
||||
alleles(g.getAlleles());
|
||||
phased(g.isPhased());
|
||||
GQ(g.getGQ());
|
||||
DP(g.getDP());
|
||||
AD(g.getAD());
|
||||
PL(g.getPL());
|
||||
filter(g.getFilters());
|
||||
attributes(g.getExtendedAttributes());
|
||||
return this;
|
||||
}
|
||||
|
||||
    /**
     * Reset all of the builder attributes to their defaults.  After this
     * function you must provide sampleName and alleles before trying to
     * make more Genotypes.
     *
     * @param keepSampleName if true, the current sample name is retained
     */
    public final void reset(final boolean keepSampleName) {
        if ( ! keepSampleName ) sampleName = null;
        alleles = Collections.emptyList();
        isPhased = false;
        // -1 is the "missing" sentinel for GQ/DP; null for the array and map fields
        GQ = -1;
        DP = -1;
        AD = null;
        PL = null;
        filters = null;
        extendedAttributes = null;
    }
|
||||
|
||||
    /**
     * Create a new Genotype object using the values set in this builder.
     *
     * After creation the values in this builder can be modified and more Genotypes
     * created, although the contents of array values like PL should never be modified
     * inline as they are not copied for efficiency reasons.
     *
     * @return a newly minted Genotype object with values provided from this builder
     */
    @Ensures({"result != null"})
    public Genotype make() {
        // share one immutable empty map across all genotypes with no extended attributes
        final Map<String, Object> ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes;
        return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea);
    }
|
||||
|
||||
    /**
     * Set this genotype's sample name
     * @param sampleName non-null sample name
     * @return this builder, for method chaining
     */
    @Requires({"sampleName != null"})
    @Ensures({"this.sampleName != null"})
    public GenotypeBuilder name(final String sampleName) {
        this.sampleName = sampleName;
        return this;
    }
|
||||
|
||||
/**
|
||||
* Set this genotype's alleles
|
||||
* @param alleles
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"this.alleles != null"})
|
||||
public GenotypeBuilder alleles(final List<Allele> alleles) {
|
||||
if ( alleles == null )
|
||||
this.alleles = Collections.emptyList();
|
||||
else
|
||||
this.alleles = alleles;
|
||||
return this;
|
||||
}
|
||||
|
||||
    /**
     * Is this genotype phased?
     * @param phased true if the alleles are phased
     * @return this builder, for method chaining
     */
    public GenotypeBuilder phased(final boolean phased) {
        isPhased = phased;
        return this;
    }
|
||||
|
||||
    /**
     * Set the phred-scaled genotype quality.
     * @param GQ genotype quality; -1 means "missing"
     * @return this builder, for method chaining
     */
    @Requires({"GQ >= -1"})
    @Ensures({"this.GQ == GQ", "this.GQ >= -1"})
    public GenotypeBuilder GQ(final int GQ) {
        this.GQ = GQ;
        return this;
    }
|
||||
|
||||
/**
|
||||
* Adaptor interface from the pLog10Error system.
|
||||
*
|
||||
* Will be retired when
|
||||
*
|
||||
* @param pLog10Error
|
||||
* @return
|
||||
*/
|
||||
@Deprecated
|
||||
public GenotypeBuilder log10PError(final double pLog10Error) {
|
||||
if ( pLog10Error == CommonInfo.NO_LOG10_PERROR )
|
||||
return GQ(-1);
|
||||
else
|
||||
return GQ((int)Math.round(pLog10Error * -10));
|
||||
}
|
||||
|
||||
    /**
     * This genotype has no GQ value (sets the -1 sentinel)
     * @return this builder, for method chaining
     */
    public GenotypeBuilder noGQ() { GQ = -1; return this; }
|
||||
|
||||
    /**
     * This genotype has no AD value
     * @return this builder, for method chaining
     */
    public GenotypeBuilder noAD() { AD = null; return this; }
|
||||
|
||||
    /**
     * This genotype has no DP value (sets the -1 sentinel)
     * @return this builder, for method chaining
     */
    public GenotypeBuilder noDP() { DP = -1; return this; }
|
||||
|
||||
    /**
     * This genotype has no PL value
     * @return this builder, for method chaining
     */
    public GenotypeBuilder noPL() { PL = null; return this; }
|
||||
|
||||
    /**
     * This genotype has this DP value
     * @param DP read depth; -1 means "missing"
     * @return this builder, for method chaining
     */
    @Requires({"DP >= -1"})
    @Ensures({"this.DP == DP"})
    public GenotypeBuilder DP(final int DP) {
        this.DP = DP;
        return this;
    }
|
||||
|
||||
    /**
     * This genotype has this AD value.  The array is stored by reference (not
     * copied), so callers must not modify it after passing it in.
     * @param AD per-allele depths; null means "missing"
     * @return this builder, for method chaining
     */
    @Requires({"AD == null || AD.length > 0"})
    @Ensures({"this.AD == AD"})
    public GenotypeBuilder AD(final int[] AD) {
        this.AD = AD;
        return this;
    }
|
||||
|
||||
    /**
     * This genotype has this PL value, as int[]. FAST.
     * The array is stored by reference (not copied), so callers must not modify it afterwards.
     * @param PL phred-scaled genotype likelihoods; null means "missing"
     * @return this builder, for method chaining
     */
    @Requires("PL == null || PL.length > 0")
    @Ensures({"this.PL == PL"})
    public GenotypeBuilder PL(final int[] PL) {
        this.PL = PL;
        return this;
    }
|
||||
|
||||
    /**
     * This genotype has this PL value, converted from double[] log10 likelihoods. SLOW.
     * @param GLs log10 genotype likelihoods, converted to phred-scaled PLs before storing
     * @return this builder, for method chaining
     */
    @Requires("PL == null || PL.length > 0")
    @Ensures({"this.PL == PL"})
    public GenotypeBuilder PL(final double[] GLs) {
        this.PL = GenotypeLikelihoods.fromLog10Likelihoods(GLs).getAsPLs();
        return this;
    }
|
||||
|
||||
    /**
     * This genotype has these attributes.
     *
     * Cannot contain inline attributes (DP, AD, GQ, PL)
     * @param attributes non-null map of extended attributes, added one at a time
     * @return this builder, for method chaining
     */
    @Requires("attributes != null")
    @Ensures("attributes.isEmpty() || extendedAttributes != null")
    public GenotypeBuilder attributes(final Map<String, Object> attributes) {
        for ( Map.Entry<String, Object> pair : attributes.entrySet() )
            attribute(pair.getKey(), pair.getValue());
        return this;
    }
|
||||
|
||||
    /**
     * Tells this builder to remove all extended attributes
     *
     * @return this builder, for method chaining
     */
    public GenotypeBuilder noAttributes() {
        this.extendedAttributes = null;
        return this;
    }
|
||||
|
||||
    /**
     * This genotype has this attribute key / value pair.
     *
     * Cannot contain inline attributes (DP, AD, GQ, PL)
     * @param key non-null attribute name
     * @param value attribute value
     * @return this builder, for method chaining
     */
    @Requires({"key != null"})
    @Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"})
    public GenotypeBuilder attribute(final String key, final Object value) {
        // lazily allocate the map, presized by maxAttributes() if it was called
        if ( extendedAttributes == null )
            extendedAttributes = new HashMap<String, Object>(initialAttributeMapSize);
        extendedAttributes.put(key, value);
        return this;
    }
|
||||
|
||||
    /**
     * Tells this builder to make a Genotype object that has had filters applied,
     * which may be empty (passes) or have some value indicating the reasons
     * why it's been filtered.
     *
     * @param filters non-null list of filters.  empty list => PASS
     * @return this builder
     */
    @Requires("filters != null")
    public GenotypeBuilder filters(final List<String> filters) {
        if ( filters.isEmpty() )
            return filter(null);
        else if ( filters.size() == 1 )
            return filter(filters.get(0));
        else
            // multiple filters are sorted and joined with ';', per VCF convention
            return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters)));
    }
|
||||
|
||||
    /**
     * varargs version of #filters
     * @param filters filter names; empty => PASS
     * @return this builder
     */
    @Requires("filters != null")
    public GenotypeBuilder filters(final String ... filters) {
        return filters(Arrays.asList(filters));
    }
|
||||
|
||||
    /**
     * Most efficient version of setting filters -- just set the filters string to filter
     *
     * @param filter if filter == null or filter equals "PASS" => genotype is PASS
     * @return this builder
     */
    public GenotypeBuilder filter(final String filter) {
        // normalize the VCF "PASS" string to null, the internal representation of passing
        this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter;
        return this;
    }
|
||||
|
||||
    /**
     * This genotype is unfiltered
     *
     * @return this builder
     */
    public GenotypeBuilder unfiltered() {
        return filter(null);
    }
|
||||
|
||||
    /**
     * Tells this builder that we have at most this number of extended attributes,
     * used to presize the attribute map allocated by attribute().
     * @param i expected maximum number of extended attributes
     * @return this builder
     */
    public GenotypeBuilder maxAttributes(final int i) {
        initialAttributeMapSize = i;
        return this;
    }
|
||||
}
|
||||
|
|
@ -1,463 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Represents the genotype likelihoods (the VCF GL/PL fields) for a single genotype.
 *
 * Keeps two lazily-created representations -- a vector of log10 likelihoods and the
 * phred-scaled comma-separated PL string -- constructing each from the other on demand.
 * Also hosts the static machinery for mapping PL indices to allele pairs and for
 * computing how many likelihood values a given (numAlleles, ploidy) pair requires.
 */
public class GenotypeLikelihoods {
    private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
    private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10;
    // caching numAlleles up to 5 and ploidy up to 10
    private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY];

    // cap applied when converting log10 likelihoods to integer PLs (see GLsToPLs)
    public final static int MAX_PL = Short.MAX_VALUE;

    //
    // There are two objects here because we are lazy in creating both representations
    // for this object: a vector of log10 Probs and the PL phred-scaled string.  Supports
    // having one set during initialization, and dynamic creation of the other, if needed
    //
    private double[] log10Likelihoods = null;
    private String likelihoodsAsString_PLs = null;

    /**
     * initialize num likelihoods cache
     */
    static {
        // must be done before PLIndexToAlleleIndex
        for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) {
            for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) {
                numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy);
            }
        }
    }

    /**
     * The maximum number of alleles that we can represent as genotype likelihoods
     */
    public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50;

    /*
     * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
     */
    private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);

    /** Create a GenotypeLikelihoods from a VCF PL field string; parsing is deferred until needed. */
    public final static GenotypeLikelihoods fromPLField(String PLs) {
        return new GenotypeLikelihoods(PLs);
    }

    /** Create a GenotypeLikelihoods from an old-style GL field string (parsed eagerly). */
    @Deprecated
    public final static GenotypeLikelihoods fromGLField(String GLs) {
        return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
    }

    /** Create a GenotypeLikelihoods directly from a vector of log10 likelihoods. */
    public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) {
        return new GenotypeLikelihoods(log10Likelihoods);
    }

    /** Create a GenotypeLikelihoods from phred-scaled integer PLs. */
    public final static GenotypeLikelihoods fromPLs(final int[] pls) {
        return new GenotypeLikelihoods(PLsToGLs(pls));
    }

    //
    // You must use the factory methods now
    //
    private GenotypeLikelihoods(String asString) {
        likelihoodsAsString_PLs = asString;
    }

    private GenotypeLikelihoods(double[] asVector) {
        log10Likelihoods = asVector;
    }

    /**
     * Returns the genotypes likelihoods in negative log10 vector format.  pr{AA} = x, this
     * vector returns math.log10(x) for each of the genotypes.  Can return null if the
     * genotype likelihoods are "missing".
     *
     * @return the log10 likelihoods vector, or null if the likelihoods are missing
     */
    public double[] getAsVector() {
        // assumes one of the likelihoods vector or the string isn't null
        if ( log10Likelihoods == null ) {
            // parse the PL string into the log10 likelihoods vector on first use
            log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs);
        }

        return log10Likelihoods;
    }

    /** @return the likelihoods as phred-scaled integer PLs, or null if missing */
    public int[] getAsPLs() {
        final double[] GLs = getAsVector();
        return GLs == null ? null : GLsToPLs(GLs);
    }

    public String toString() {
        return getAsString();
    }

    /** @return the comma-separated PL string representation, creating it on demand */
    public String getAsString() {
        if ( likelihoodsAsString_PLs == null ) {
            // todo -- should we accept null log10Likelihoods and set PLs as MISSING?
            if ( log10Likelihoods == null )
                throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!");
            likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods);
        }

        return likelihoodsAsString_PLs;
    }

    @Override public boolean equals(Object aThat) {
        //check for self-comparison
        if ( this == aThat ) return true;

        if ( !(aThat instanceof GenotypeLikelihoods) ) return false;
        GenotypeLikelihoods that = (GenotypeLikelihoods)aThat;

        // now a proper field-by-field evaluation can be made.
        // GLs are considered equal if the corresponding PLs are equal
        return Arrays.equals(getAsPLs(), that.getAsPLs());
    }

    //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
    //Returns null in case of missing likelihoods
    public EnumMap<GenotypeType,Double> getAsMap(boolean normalizeFromLog10){
        //Make sure that the log10likelihoods are set
        double[] likelihoods = normalizeFromLog10 ? GeneralUtils.normalizeFromLog10(getAsVector()) : getAsVector();
        if(likelihoods == null)
            return null;
        EnumMap<GenotypeType,Double> likelihoodsMap = new EnumMap<GenotypeType, Double>(GenotypeType.class);
        // biallelic assumption: indexing with ordinal()-1 relies on NO_CALL being ordinal 0
        likelihoodsMap.put(GenotypeType.HOM_REF,likelihoods[GenotypeType.HOM_REF.ordinal()-1]);
        likelihoodsMap.put(GenotypeType.HET,likelihoods[GenotypeType.HET.ordinal()-1]);
        likelihoodsMap.put(GenotypeType.HOM_VAR, likelihoods[GenotypeType.HOM_VAR.ordinal() - 1]);
        return likelihoodsMap;
    }

    //Return the neg log10 Genotype Quality (GQ) for the given genotype
    //Returns Double.NEGATIVE_INFINITY in case of missing genotype

    /**
     * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context.
     * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List&lt;Allele&gt;) in place of it.
     *
     * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly.
     * @param genotype - actually a genotype type (no call, hom ref, het, hom var)
     * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best).
     */
    @Deprecated
    public double getLog10GQ(GenotypeType genotype){
        return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
    }

    @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"})
    private double getLog10GQ(List<Allele> genotypeAlleles,List<Allele> contextAlleles) {
        // map the two genotype alleles into the context's allele ordering, then to a PL index
        int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0));
        int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1));
        int plIndex = calculatePLindex(allele1Index,allele2Index);
        return getGQLog10FromLikelihoods(plIndex,getAsVector());
    }

    /** GQ for genotype's alleles against the given allele list (safe for multi-allelic contexts). */
    public double getLog10GQ(Genotype genotype, List<Allele> vcAlleles ) {
        return getLog10GQ(genotype.getAlleles(),vcAlleles);
    }

    /** GQ for genotype's alleles against the context's alleles (safe for multi-allelic contexts). */
    public double getLog10GQ(Genotype genotype, VariantContext context) {
        return getLog10GQ(genotype,context.getAlleles());
    }

    /**
     * Compute the log10 GQ for the chosen genotype index given the full log10 likelihoods vector.
     * Returns Double.NEGATIVE_INFINITY if the likelihoods are missing.
     */
    public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
        if(likelihoods == null)
            return Double.NEGATIVE_INFINITY;

        // find the best likelihood among all genotypes OTHER than the chosen one
        double qual = Double.NEGATIVE_INFINITY;
        for (int i=0; i < likelihoods.length; i++) {
            if (i==iOfChoosenGenotype)
                continue;
            if (likelihoods[i] >= qual)
                qual = likelihoods[i];
        }

        // qual contains now max(likelihoods[k]) for all k != bestGTguess
        qual = likelihoods[iOfChoosenGenotype] - qual;

        if (qual < 0) {
            // QUAL can be negative if the chosen genotype is not the most likely one individually.
            // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
            double[] normalized = GeneralUtils.normalizeFromLog10(likelihoods);
            double chosenGenotype = normalized[iOfChoosenGenotype];
            return Math.log10(1.0 - chosenGenotype);
        } else {
            // invert the sign, as this is the probability of making an error
            return -1 * qual;
        }
    }

    private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
        if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
            String[] strings = likelihoodsAsString_PLs.split(",");
            double[] likelihoodsAsVector = new double[strings.length];
            try {
                for ( int i = 0; i < strings.length; i++ ) {
                    // PL = -10 * log10(likelihood), so divide by -10 to recover the log10 value
                    likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0;
                }
            } catch (NumberFormatException e) {
                throw new TribbleException("The GL/PL tag contains non-integer values: " + likelihoodsAsString_PLs);
            }
            return likelihoodsAsVector;
        } else
            return null;
    }

    /**
     * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format
     * @param GLString comma-separated log10 likelihoods, or the VCF missing value
     * @return the parsed log10 likelihoods vector, or null if missing
     */
    private final static double[] parseDeprecatedGLString(String GLString) {
        if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) {
            String[] strings = GLString.split(",");
            double[] likelihoodsAsVector = new double[strings.length];
            for ( int i = 0; i < strings.length; i++ ) {
                likelihoodsAsVector[i] = Double.parseDouble(strings[i]);
            }
            return likelihoodsAsVector;
        }

        return null;
    }

    /** Render log10 likelihoods as a comma-separated PL string, or the VCF missing value for null. */
    private final static String convertLikelihoodsToPLString(final double[] GLs) {
        if ( GLs == null )
            return VCFConstants.MISSING_VALUE_v4;

        final StringBuilder s = new StringBuilder();
        boolean first = true;
        for ( final int pl : GLsToPLs(GLs) ) {
            if ( ! first )
                s.append(",");
            else
                first = false;

            s.append(pl);
        }

        return s.toString();
    }

    private final static int[] GLsToPLs(final double[] GLs) {
        final int[] pls = new int[GLs.length];
        // shift by the best (maximum) GL so the most likely genotype gets PL = 0
        final double adjust = maxPL(GLs);

        for ( int i = 0; i < GLs.length; i++ ) {
            pls[i] = (int)Math.round(Math.min(-10 * (GLs[i] - adjust), MAX_PL));
        }

        return pls;
    }

    /** @return the maximum value in GLs (used as the normalization constant in GLsToPLs) */
    private final static double maxPL(final double[] GLs) {
        double adjust = Double.NEGATIVE_INFINITY;
        for ( double l : GLs ) adjust = Math.max(adjust, l);
        return adjust;
    }

    private final static double[] PLsToGLs(final int pls[]) {
        double[] likelihoodsAsVector = new double[pls.length];
        for ( int i = 0; i < pls.length; i++ ) {
            likelihoodsAsVector[i] = pls[i] / -10.0;
        }
        return likelihoodsAsVector;
    }

    // -------------------------------------------------------------------------------------
    //
    // Static conversion utilities, going from GL/PL index to allele index and vice versa.
    //
    // -------------------------------------------------------------------------------------

    /*
     * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
     * Note that the reference allele is always index=0.
     */
    public static class GenotypeLikelihoodsAllelePair {
        public final int alleleIndex1, alleleIndex2;

        public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) {
            this.alleleIndex1 = alleleIndex1;
            this.alleleIndex2 = alleleIndex2;
        }
    }

    private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) {
        // diploid table: one entry per PL index for up to altAlleles alternate alleles
        final int numLikelihoods = numLikelihoods(1 + altAlleles, 2);
        final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods];

        // for all possible combinations of 2 alleles
        for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
            for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
                cache[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2);
            }
        }

        // a bit of sanity checking
        for ( int i = 0; i < cache.length; i++ ) {
            if ( cache[i] == null )
                throw new IllegalStateException("BUG: cache entry " + i + " is unexpected null");
        }

        return cache;
    }

    // -------------------------------------------------------------------------------------
    //
    // num likelihoods given number of alleles and ploidy
    //
    // -------------------------------------------------------------------------------------

    /**
     * Actually does the computation in @see #numLikelihoods
     *
     * @param numAlleles number of alleles (including ref)
     * @param ploidy number of chromosomes
     * @return number of likelihood values needed
     */
    private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) {
        if (numAlleles == 1)
            return 1;
        else if (ploidy == 1)
            return numAlleles;
        else {
            int acc =0;
            for (int k=0; k <= ploidy; k++ )
                acc += calcNumLikelihoods(numAlleles - 1, ploidy - k);
            return acc;
        }
    }

    /**
     * Compute how many likelihood elements are associated with the given number of alleles
     * Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P)
     * where P = ploidy (number of chromosomes) and N = total # of alleles.
     * Each chromosome can be in one single state (0,...,N-1) and there are P of them.
     * Naive solution would be to store N*P likelihoods, but this is not necessary because we can't distinguish chromosome states, but rather
     * only total number of alt allele counts in all chromosomes.
     *
     * For example, S(3,2) = 6: For alleles A,B,C, on a diploid organism we have six possible genotypes:
     * AA,AB,BB,AC,BC,CC.
     * Another way of expressing is with vector (#of A alleles, # of B alleles, # of C alleles)
     * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,0,1), (0,1,1), (0,0,2)
     * In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2
     *
     * Note this method caches the value for most common num Allele / ploidy combinations for efficiency
     *
     * Recursive implementation:
     *   S(N,P) = sum_{k=0}^P S(N-1,P-k)
     * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K
     * With initial conditions
     *   S(N,1) = N  (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors)
     *   S(1,P) = 1  (only way to have 1 integer add to P is with that integer P itself).
     *
     * @param numAlleles Number of alleles (including ref)
     * @param ploidy Ploidy, or number of chromosomes in set
     * @return Number of likelihood elements we need to hold.
     */
    @Requires({"ploidy > 0", "numAlleles > 0"})
    @Ensures("result > 0")
    public static int numLikelihoods(final int numAlleles, final int ploidy) {
        if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES
                && ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY )
            return numLikelihoodCache[numAlleles][ploidy];
        else {
            // have to calculate on the fly
            return calcNumLikelihoods(numAlleles, ploidy);
        }
    }

    // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
    // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
    // Assumes that allele1Index < allele2Index
    public static int calculatePLindex(final int allele1Index, final int allele2Index) {
        return (allele2Index * (allele2Index+1) / 2) + allele1Index;
    }

    /**
     * get the allele index pair for the given PL
     *
     * @param PLindex the PL index
     * @return the allele index pair
     */
    public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) {
        // make sure that we've cached enough data
        if ( PLindex >= PLIndexToAlleleIndex.length )
            throw new IllegalStateException("Internal limitation: cannot genotype more than " + MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles");

        return PLIndexToAlleleIndex[PLindex];
    }

    // An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles
    protected static final int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9};

    /**
     * get the allele index pair for the given PL using the deprecated PL ordering:
     * AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD.
     * Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated
     * ordering and I know with certainty that external users have built code on top of it; changing it now would
     * cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
     * This method assumes at most 3 alternate alleles.
     *
     * @param PLindex the PL index
     * @return the allele index pair
     */
    @Deprecated
    public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
        return getAllelePair(PLindexConversion[PLindex]);
    }

    /**
     * get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index &lt;= allele2Index
     *
     * @param allele1Index the index in VariantContext.getAllele() of the first allele
     * @param allele2Index the index in VariantContext.getAllele() of the second allele
     * @return the PL indexes
     */
    public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) {

        final int[] indexes = new int[3];
        indexes[0] = calculatePLindex(allele1Index, allele1Index);
        indexes[1] = calculatePLindex(allele1Index, allele2Index);
        indexes[2] = calculatePLindex(allele2Index, allele2Index);
        return indexes;
    }
}
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
/**
 * Summary types for Genotype objects
 *
 * NOTE: the declaration order matters -- GenotypeLikelihoods maps biallelic
 * likelihood indices via ordinal() - 1, relying on NO_CALL being first.
 */
public enum GenotypeType {
    /** The sample is no-called (all alleles are NO_CALL) */
    NO_CALL,
    /** The sample is homozygous reference */
    HOM_REF,
    /** The sample is heterozygous, with at least one ref and at least one alt in any order */
    HET,
    /** All alleles are non-reference */
    HOM_VAR,
    /** There is no allele data available for this sample (alleles.isEmpty) */
    UNAVAILABLE,
    /** Some chromosomes are NO_CALL and others are called */
    MIXED // no-call and call in the same genotype
}
|
||||
|
|
@ -1,724 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Represents an ordered collection of Genotype objects
|
||||
*/
|
||||
public class GenotypesContext implements List<Genotype> {
|
||||
/**
|
||||
* static constant value for an empty GenotypesContext. Useful since so many VariantContexts have no genotypes
|
||||
*/
|
||||
public final static GenotypesContext NO_GENOTYPES =
|
||||
new GenotypesContext(new ArrayList<Genotype>(0), new HashMap<String, Integer>(0), Collections.<String>emptyList()).immutable();
|
||||
|
||||
/**
|
||||
*sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical order
|
||||
*/
|
||||
List<String> sampleNamesInOrder = null;
|
||||
|
||||
/**
|
||||
* a map optimized for efficient lookup. Each genotype in genotypes must have its
|
||||
* sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that
|
||||
* genotype in the vector of genotypes
|
||||
*/
|
||||
Map<String, Integer> sampleNameToOffset = null;
|
||||
|
||||
/**
|
||||
* An ArrayList of genotypes contained in this context
|
||||
*
|
||||
* WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY
|
||||
* ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD.
|
||||
*
|
||||
*/
|
||||
ArrayList<Genotype> notToBeDirectlyAccessedGenotypes;
|
||||
|
||||
/**
|
||||
* Cached value of the maximum ploidy observed among all samples
|
||||
*/
|
||||
private int maxPloidy = -1;
|
||||
|
||||
/** Are we allowing users to modify the list? */
|
||||
boolean immutable = false;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// private constructors -- you have to use static create methods to make these classes
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Create an empty GenotypeContext
|
||||
*/
|
||||
protected GenotypesContext() {
|
||||
this(10);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an empty GenotypeContext, with initial capacity for n elements
|
||||
*/
|
||||
@Requires("n >= 0")
|
||||
protected GenotypesContext(final int n) {
|
||||
this(new ArrayList<Genotype>(n));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an GenotypeContext containing genotypes
|
||||
*/
|
||||
@Requires("genotypes != null")
|
||||
protected GenotypesContext(final ArrayList<Genotype> genotypes) {
|
||||
this.notToBeDirectlyAccessedGenotypes = genotypes;
|
||||
this.sampleNameToOffset = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a fully resolved GenotypeContext containing genotypes, sample lookup table,
|
||||
* and sorted sample names
|
||||
*
|
||||
* @param genotypes our genotypes in arbitrary
|
||||
* @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its
|
||||
* sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that
|
||||
* genotype in the vector of genotypes
|
||||
* @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical
|
||||
* order.
|
||||
*/
|
||||
@Requires({"genotypes != null",
|
||||
"sampleNameToOffset != null",
|
||||
"sampleNamesInOrder != null",
|
||||
"genotypes.size() == sampleNameToOffset.size()",
|
||||
"genotypes.size() == sampleNamesInOrder.size()"})
|
||||
protected GenotypesContext(final ArrayList<Genotype> genotypes,
|
||||
final Map<String, Integer> sampleNameToOffset,
|
||||
final List<String> sampleNamesInOrder) {
|
||||
this.notToBeDirectlyAccessedGenotypes = genotypes;
|
||||
this.sampleNameToOffset = sampleNameToOffset;
|
||||
this.sampleNamesInOrder = sampleNamesInOrder;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// public static factory methods
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Basic creation routine
|
||||
* @return an empty, mutable GenotypeContext
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext create() {
|
||||
return new GenotypesContext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic creation routine
|
||||
* @return an empty, mutable GenotypeContext with initial capacity for nGenotypes
|
||||
*/
|
||||
@Requires("nGenotypes >= 0")
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext create(final int nGenotypes) {
|
||||
return new GenotypesContext(nGenotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a fully resolved GenotypeContext containing genotypes, sample lookup table,
|
||||
* and sorted sample names
|
||||
*
|
||||
* @param genotypes our genotypes in arbitrary
|
||||
* @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its
|
||||
* sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that
|
||||
* genotype in the vector of genotypes
|
||||
* @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical
|
||||
* order.
|
||||
* @return an mutable GenotypeContext containing genotypes with already present lookup data
|
||||
*/
|
||||
@Requires({"genotypes != null",
|
||||
"sampleNameToOffset != null",
|
||||
"sampleNamesInOrder != null"})
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext create(final ArrayList<Genotype> genotypes,
|
||||
final Map<String, Integer> sampleNameToOffset,
|
||||
final List<String> sampleNamesInOrder) {
|
||||
return new GenotypesContext(genotypes, sampleNameToOffset, sampleNamesInOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a fully resolved GenotypeContext containing genotypes
|
||||
*
|
||||
* @param genotypes our genotypes in arbitrary
|
||||
* @return an mutable GenotypeContext containing genotypes
|
||||
*/
|
||||
@Requires({"genotypes != null"})
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext create(final ArrayList<Genotype> genotypes) {
|
||||
return genotypes == null ? NO_GENOTYPES : new GenotypesContext(genotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a fully resolved GenotypeContext containing genotypes
|
||||
*
|
||||
* @param genotypes our genotypes in arbitrary
|
||||
* @return an mutable GenotypeContext containing genotypes
|
||||
*/
|
||||
@Requires({"genotypes != null"})
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext create(final Genotype... genotypes) {
|
||||
return create(new ArrayList<Genotype>(Arrays.asList(genotypes)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a freshly allocated GenotypeContext containing the genotypes in toCopy
|
||||
*
|
||||
* @param toCopy the GenotypesContext to copy
|
||||
* @return an mutable GenotypeContext containing genotypes
|
||||
*/
|
||||
@Requires({"toCopy != null"})
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext copy(final GenotypesContext toCopy) {
|
||||
return create(new ArrayList<Genotype>(toCopy.getGenotypes()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a GenotypesContext containing the genotypes in iteration order contained
|
||||
* in toCopy
|
||||
*
|
||||
* @param toCopy the collection of genotypes
|
||||
* @return an mutable GenotypeContext containing genotypes
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public static final GenotypesContext copy(final Collection<Genotype> toCopy) {
|
||||
return toCopy == null ? NO_GENOTYPES : create(new ArrayList<Genotype>(toCopy));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Mutability methods
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
public final GenotypesContext immutable() {
|
||||
immutable = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean isMutable() {
|
||||
return ! immutable;
|
||||
}
|
||||
|
||||
public final void checkImmutability() {
|
||||
if ( immutable )
|
||||
throw new IllegalAccessError("GenotypeMap is currently immutable, but a mutator method was invoked on it");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// caches
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@Ensures({"sampleNameToOffset == null"})
|
||||
protected void invalidateSampleNameMap() {
|
||||
sampleNameToOffset = null;
|
||||
}
|
||||
|
||||
@Ensures({"sampleNamesInOrder == null"})
|
||||
protected void invalidateSampleOrdering() {
|
||||
sampleNamesInOrder = null;
|
||||
}
|
||||
|
||||
@Ensures({"sampleNamesInOrder != null"})
|
||||
protected void ensureSampleOrdering() {
|
||||
if ( sampleNamesInOrder == null ) {
|
||||
sampleNamesInOrder = new ArrayList<String>(size());
|
||||
|
||||
for ( int i = 0; i < size(); i++ ) {
|
||||
sampleNamesInOrder.add(getGenotypes().get(i).getSampleName());
|
||||
}
|
||||
Collections.sort(sampleNamesInOrder);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures({"sampleNameToOffset != null"})
|
||||
protected void ensureSampleNameMap() {
|
||||
if ( sampleNameToOffset == null ) {
|
||||
sampleNameToOffset = new HashMap<String, Integer>(size());
|
||||
|
||||
for ( int i = 0; i < size(); i++ ) {
|
||||
sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Lazy methods
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
public boolean isLazyWithData() {
|
||||
return this instanceof LazyGenotypesContext &&
|
||||
((LazyGenotypesContext)this).getUnparsedGenotypeData() != null;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Map methods
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
protected ArrayList<Genotype> getGenotypes() {
|
||||
return notToBeDirectlyAccessedGenotypes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
checkImmutability();
|
||||
invalidateSampleNameMap();
|
||||
invalidateSampleOrdering();
|
||||
getGenotypes().clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return getGenotypes().size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return getGenotypes().isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a single genotype to this context.
|
||||
*
|
||||
* There are many constraints on this input, and important
|
||||
* impacts on the performance of other functions provided by this
|
||||
* context.
|
||||
*
|
||||
* First, the sample name of genotype must be unique within this
|
||||
* context. However, this is not enforced in the code itself, through
|
||||
* you will invalid the contract on this context if you add duplicate
|
||||
* samples and are running with CoFoJa enabled.
|
||||
*
|
||||
* Second, adding genotype also updates the sample name -> index map,
|
||||
* so add() followed by containsSample and related function is an efficient
|
||||
* series of operations.
|
||||
*
|
||||
* Third, adding the genotype invalidates the sorted list of sample names, to
|
||||
* add() followed by any of the SampleNamesInOrder operations is inefficient, as
|
||||
* each SampleNamesInOrder must rebuild the sorted list of sample names at
|
||||
* an O(n log n) cost.
|
||||
*
|
||||
* @param genotype
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
@Requires({"genotype != null", "get(genotype.getSampleName()) == null"})
|
||||
public boolean add(final Genotype genotype) {
|
||||
checkImmutability();
|
||||
invalidateSampleOrdering();
|
||||
|
||||
if ( sampleNameToOffset != null ) {
|
||||
// update the name map by adding entries
|
||||
sampleNameToOffset.put(genotype.getSampleName(), size());
|
||||
}
|
||||
|
||||
return getGenotypes().add(genotype);
|
||||
}
|
||||
|
||||
@Override
|
||||
@Requires("! contains(genotype)")
|
||||
public void add(final int i, final Genotype genotype) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds all of the genotypes to this context
|
||||
*
|
||||
* See {@link #add(Genotype)} for important information about this functions
|
||||
* constraints and performance costs
|
||||
*
|
||||
* @param genotypes
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
@Requires("! containsAny(genotypes)")
|
||||
public boolean addAll(final Collection<? extends Genotype> genotypes) {
|
||||
checkImmutability();
|
||||
invalidateSampleOrdering();
|
||||
|
||||
if ( sampleNameToOffset != null ) {
|
||||
// update the name map by adding entries
|
||||
int pos = size();
|
||||
for ( final Genotype g : genotypes ) {
|
||||
sampleNameToOffset.put(g.getSampleName(), pos++);
|
||||
}
|
||||
}
|
||||
|
||||
return getGenotypes().addAll(genotypes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean addAll(final int i, final Collection<? extends Genotype> genotypes) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(final Object o) {
|
||||
return getGenotypes().contains(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean containsAll(final Collection<?> objects) {
|
||||
return getGenotypes().containsAll(objects);
|
||||
}
|
||||
|
||||
private boolean containsAny(final Collection<? extends Genotype> genotypes) {
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( contains(g) ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Genotype get(final int i) {
|
||||
return getGenotypes().get(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present
|
||||
*
|
||||
* @param defaultPloidy the default ploidy, if all samples are no-called
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getMaxPloidy(final int defaultPloidy) {
|
||||
if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0");
|
||||
|
||||
if ( maxPloidy == -1 ) {
|
||||
maxPloidy = 0; // necessary in the case where there are no genotypes
|
||||
for ( final Genotype g : getGenotypes() ) {
|
||||
maxPloidy = Math.max(g.getPloidy(), maxPloidy);
|
||||
}
|
||||
|
||||
// everything is no called so we return the default ploidy
|
||||
if ( maxPloidy == 0 ) maxPloidy = defaultPloidy;
|
||||
}
|
||||
|
||||
return maxPloidy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets sample associated with this sampleName, or null if none is found
|
||||
*
|
||||
* @param sampleName
|
||||
* @return
|
||||
*/
|
||||
public Genotype get(final String sampleName) {
|
||||
Integer offset = getSampleI(sampleName);
|
||||
return offset == null ? null : getGenotypes().get(offset);
|
||||
}
|
||||
|
||||
private Integer getSampleI(final String sampleName) {
|
||||
ensureSampleNameMap();
|
||||
return sampleNameToOffset.get(sampleName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int indexOf(final Object o) {
|
||||
return getGenotypes().indexOf(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Genotype> iterator() {
|
||||
return getGenotypes().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int lastIndexOf(final Object o) {
|
||||
return getGenotypes().lastIndexOf(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListIterator<Genotype> listIterator() {
|
||||
// todo -- must be immutable
|
||||
throw new UnsupportedOperationException();
|
||||
// return genotypes.listIterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListIterator<Genotype> listIterator(final int i) {
|
||||
// todo -- must be immutable
|
||||
throw new UnsupportedOperationException();
|
||||
// return genotypes.listIterator(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Note that remove requires us to invalidate our sample -> index
|
||||
* cache. The loop:
|
||||
*
|
||||
* GenotypesContext gc = ...
|
||||
* for ( sample in samples )
|
||||
* if ( gc.containsSample(sample) )
|
||||
* gc.remove(sample)
|
||||
*
|
||||
* is extremely inefficient, as each call to remove invalidates the cache
|
||||
* and containsSample requires us to rebuild it, an O(n) operation.
|
||||
*
|
||||
* If you must remove many samples from the GC, use either removeAll or retainAll
|
||||
* to avoid this O(n * m) operation.
|
||||
*
|
||||
* @param i
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public Genotype remove(final int i) {
|
||||
checkImmutability();
|
||||
invalidateSampleNameMap();
|
||||
invalidateSampleOrdering();
|
||||
return getGenotypes().remove(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* See for important warning {@link this.remove(Integer)}
|
||||
* @param o
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean remove(final Object o) {
|
||||
checkImmutability();
|
||||
invalidateSampleNameMap();
|
||||
invalidateSampleOrdering();
|
||||
return getGenotypes().remove(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean removeAll(final Collection<?> objects) {
|
||||
checkImmutability();
|
||||
invalidateSampleNameMap();
|
||||
invalidateSampleOrdering();
|
||||
return getGenotypes().removeAll(objects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retainAll(final Collection<?> objects) {
|
||||
checkImmutability();
|
||||
invalidateSampleNameMap();
|
||||
invalidateSampleOrdering();
|
||||
return getGenotypes().retainAll(objects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Genotype set(final int i, final Genotype genotype) {
|
||||
checkImmutability();
|
||||
final Genotype prev = getGenotypes().set(i, genotype);
|
||||
|
||||
invalidateSampleOrdering();
|
||||
if ( sampleNameToOffset != null ) {
|
||||
// update the name map by removing the old entry and replacing it with the new one
|
||||
sampleNameToOffset.remove(prev.getSampleName());
|
||||
sampleNameToOffset.put(genotype.getSampleName(), i);
|
||||
}
|
||||
|
||||
return prev;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces the genotype in this context -- note for efficiency
|
||||
* reasons we do not add the genotype if it's not present. The
|
||||
* return value will be null indicating this happened.
|
||||
*
|
||||
* Note this operation is preserves the map cache Sample -> Offset but
|
||||
* invalidates the sorted list of samples. Using replace within a loop
|
||||
* containing any of the SampleNameInOrder operation requires an O(n log n)
|
||||
* resorting after each replace operation.
|
||||
*
|
||||
* @param genotype a non null genotype to bind in this context
|
||||
* @return null if genotype was not added, otherwise returns the previous genotype
|
||||
*/
|
||||
@Requires("genotype != null")
|
||||
public Genotype replace(final Genotype genotype) {
|
||||
checkImmutability();
|
||||
Integer offset = getSampleI(genotype.getSampleName());
|
||||
if ( offset == null )
|
||||
return null;
|
||||
else
|
||||
return set(offset, genotype);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Genotype> subList(final int i, final int i1) {
|
||||
return getGenotypes().subList(i, i1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] toArray() {
|
||||
return getGenotypes().toArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T[] toArray(final T[] ts) {
|
||||
return getGenotypes().toArray(ts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the Genotypes in this context in the order specified by sampleNamesInOrder
|
||||
*
|
||||
* @param sampleNamesInOrder a Iterable of String, containing exactly one entry for each Genotype sample name in
|
||||
* this context
|
||||
* @return a Iterable over the genotypes in this context.
|
||||
*/
|
||||
@Requires("sampleNamesInOrder != null")
|
||||
public Iterable<Genotype> iterateInSampleNameOrder(final Iterable<String> sampleNamesInOrder) {
|
||||
return new Iterable<Genotype>() {
|
||||
@Override
|
||||
public Iterator<Genotype> iterator() {
|
||||
return new InOrderIterator(sampleNamesInOrder.iterator());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the Genotypes in this context in their sample name order (A, B, C)
|
||||
* regardless of the underlying order in the vector of genotypes
|
||||
* @return a Iterable over the genotypes in this context.
|
||||
*/
|
||||
public Iterable<Genotype> iterateInSampleNameOrder() {
|
||||
return iterateInSampleNameOrder(getSampleNamesOrderedByName());
|
||||
}
|
||||
|
||||
private final class InOrderIterator implements Iterator<Genotype> {
|
||||
final Iterator<String> sampleNamesInOrder;
|
||||
|
||||
private InOrderIterator(final Iterator<String> sampleNamesInOrder) {
|
||||
this.sampleNamesInOrder = sampleNamesInOrder;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return sampleNamesInOrder.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Genotype next() {
|
||||
return get(sampleNamesInOrder.next());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The set of sample names for all genotypes in this context, in arbitrary order
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Set<String> getSampleNames() {
|
||||
ensureSampleNameMap();
|
||||
return sampleNameToOffset.keySet();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The set of sample names for all genotypes in this context, in their natural ordering (A, B, C)
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public List<String> getSampleNamesOrderedByName() {
|
||||
ensureSampleOrdering();
|
||||
return sampleNamesInOrder;
|
||||
}
|
||||
|
||||
@Requires("sample != null")
|
||||
public boolean containsSample(final String sample) {
|
||||
ensureSampleNameMap();
|
||||
return sampleNameToOffset.containsKey(sample);
|
||||
}
|
||||
|
||||
@Requires("samples != null")
|
||||
public boolean containsSamples(final Collection<String> samples) {
|
||||
return getSampleNames().containsAll(samples);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a freshly allocated subcontext of this context containing only the samples
|
||||
* listed in samples. Note that samples can contain names not in this context, they
|
||||
* will just be ignored.
|
||||
*
|
||||
* @param samples
|
||||
* @return
|
||||
*/
|
||||
@Requires("samples != null")
|
||||
@Ensures("result != null")
|
||||
public GenotypesContext subsetToSamples( final Set<String> samples ) {
|
||||
final int nSamples = samples.size();
|
||||
|
||||
if ( nSamples == 0 )
|
||||
return NO_GENOTYPES;
|
||||
else { // nGenotypes < nSamples
|
||||
final GenotypesContext subset = create(samples.size());
|
||||
for ( final String sample : samples ) {
|
||||
final Genotype g = get(sample);
|
||||
if ( g != null )
|
||||
subset.add(g);
|
||||
}
|
||||
return subset;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final List<String> gS = new ArrayList<String>();
|
||||
for ( final Genotype g : this.iterateInSampleNameOrder() )
|
||||
gS.add(g.toString());
|
||||
return "[" + join(",", gS) + "]";
|
||||
}
|
||||
|
||||
// copied from Utils
|
||||
private static <T> String join(final String separator, final Collection<T> objects) {
|
||||
if (objects.isEmpty()) { // fast path for empty collection
|
||||
return "";
|
||||
} else {
|
||||
final Iterator<T> iter = objects.iterator();
|
||||
final T first = iter.next();
|
||||
|
||||
if ( ! iter.hasNext() ) // fast path for singleton collections
|
||||
return first.toString();
|
||||
else { // full path for 2+ collection that actually need a join
|
||||
final StringBuilder ret = new StringBuilder(first.toString());
|
||||
while(iter.hasNext()) {
|
||||
ret.append(separator);
|
||||
ret.append(iter.next().toString());
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,198 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Lazy-loading GenotypesContext.  A lazy-loading context has access to the
 * VCFParser and a unparsed string of genotype data.  If the user attempts to manipulate
 * the genotypes contained in this context, we decode the data and become a full blown
 * GenotypesContext.  However, if the user never does this we are spared a lot of expense
 * decoding the genotypes unnecessarily.
 *
 * NOTE(review): synchronization is partial -- ensureSampleNameMap()/ensureSampleOrdering()
 * are synchronized but decode() itself is not, so concurrent first access through other
 * paths (e.g. getGenotypes) may race; confirm single-threaded use before sharing instances.
 */
public class LazyGenotypesContext extends GenotypesContext {
    /** The LazyParser we'll use to decode unparsedGenotypeData if necessary */
    final LazyParser parser;

    // The still-encoded genotype data handed to the constructor; set to null once
    // decode() has run so the memory can be reclaimed.
    Object unparsedGenotypeData;

    /**
     * nUnparsedGenotypes the number of genotypes contained in the unparsedGenotypes data
     * (known already in the parser).  Useful for isEmpty and size() optimizations
     */
    final int nUnparsedGenotypes;

    /**
     * True if we've already decoded the values in unparsedGenotypeData
     */
    boolean loaded = false;

    // Shared sentinel passed to super() before decoding; never mutated, since every
    // access goes through getGenotypes(), which decodes and replaces the field first.
    private final static ArrayList<Genotype> EMPTY = new ArrayList<Genotype>(0);

    /**
     * Simple lazy parser interface.  Provide an object implementing this
     * interface to LazyGenotypesContext, and it's parse method will be called
     * when the use of the lazy context requires the underlying genotypes data
     * be parsed into Genotype objects.  The data argument is the data provided
     * to the LazyGenotypesContext holding encoded genotypes data
     */
    public interface LazyParser {
        @Requires("data != null")
        @Ensures("result != null")
        public LazyData parse(Object data);
    }

    /**
     * Returns the data used in the full GenotypesContext constructor
     *
     * {@link GenotypesContext#GenotypesContext(java.util.ArrayList, java.util.Map, java.util.List)}
     */
    public static class LazyData {
        // Decoded genotypes, in the order they appeared in the encoded data
        final ArrayList<Genotype> genotypes;
        // sample name -> offset into genotypes
        final Map<String, Integer> sampleNameToOffset;
        // sample names sorted alphabetically
        final List<String> sampleNamesInOrder;

        @Requires({"genotypes != null", "sampleNamesInOrder != null", "sampleNameToOffset != null"})
        public LazyData(final ArrayList<Genotype> genotypes,
                        final List<String> sampleNamesInOrder,
                        final Map<String, Integer> sampleNameToOffset) {
            this.genotypes = genotypes;
            this.sampleNamesInOrder = sampleNamesInOrder;
            this.sampleNameToOffset = sampleNameToOffset;
        }
    }

    /**
     * Creates a new lazy loading genotypes context using the LazyParser to create
     * genotypes data on demand.
     *
     * @param parser the parser to be used to load on-demand genotypes data
     * @param unparsedGenotypeData the encoded genotypes data that we will decode if necessary
     * @param nUnparsedGenotypes the number of genotypes that will be produced if / when we actually decode the genotypes data
     */
    @Requires({"parser != null", "unparsedGenotypeData != null", "nUnparsedGenotypes >= 0"})
    public LazyGenotypesContext(final LazyParser parser, final Object unparsedGenotypeData, final int nUnparsedGenotypes) {
        super(EMPTY);
        this.parser = parser;
        this.unparsedGenotypeData = unparsedGenotypeData;
        this.nUnparsedGenotypes = nUnparsedGenotypes;
    }

    /**
     * Overrides the genotypes accessor.  If we haven't already, decode the genotypes data
     * and store the decoded results in the appropriate variables.  Otherwise we just
     * returned the decoded result directly.  Note some care needs to be taken here as
     * the value in notToBeDirectlyAccessedGenotypes may diverge from what would be produced
     * by decode, if after the first decode the genotypes themselves are replaced
     * @return the decoded genotypes vector, never the EMPTY sentinel
     */
    @Override
    @Ensures("result != null")
    protected ArrayList<Genotype> getGenotypes() {
        decode();
        return notToBeDirectlyAccessedGenotypes;
    }

    /**
     * Force us to decode the genotypes, if not already done.  After this call the
     * context behaves like a fully materialized GenotypesContext and the encoded
     * data reference is dropped.
     */
    public void decode() {
        if ( ! loaded ) {
            //System.out.printf("Loading genotypes... %s:%d%n", contig, start);
            LazyData parsed = parser.parse(unparsedGenotypeData);
            notToBeDirectlyAccessedGenotypes = parsed.genotypes;
            sampleNamesInOrder = parsed.sampleNamesInOrder;
            sampleNameToOffset = parsed.sampleNameToOffset;
            loaded = true;
            unparsedGenotypeData = null; // don't hold the unparsed data any longer

            // warning -- this path allows us to create a VariantContext that doesn't run validateGenotypes()
            // That said, it's not such an important routine -- it's just checking that the genotypes
            // are well formed w.r.t. the alleles list, but this will be enforced within the VCFCodec
        }
    }

    /**
     * Overrides the ensure* functionality.  If the data hasn't been loaded
     * yet and we want to build the cache, just decode it and we're done.  If we've
     * already decoded the data, though, go through the super class
     */
    @Override
    protected synchronized void ensureSampleNameMap() {
        if ( ! loaded ) {
            decode(); // will load up all of the necessary data
        } else {
            super.ensureSampleNameMap();
        }
    }

    @Override
    protected synchronized void ensureSampleOrdering() {
        if ( ! loaded ) {
            decode(); // will load up all of the necessary data
        } else {
            super.ensureSampleOrdering();
        }
    }

    @Override
    protected void invalidateSampleNameMap() {
        // if the cache is invalidated, and we haven't loaded our data yet, do so
        // (decoding first keeps the genotypes themselves even though the cache is dropped)
        if ( ! loaded ) decode();
        super.invalidateSampleNameMap();
    }

    @Override
    protected void invalidateSampleOrdering() {
        // if the cache is invalidated, and we haven't loaded our data yet, do so
        if ( ! loaded ) decode();
        super.invalidateSampleOrdering();
    }

    @Override
    public boolean isEmpty() {
        // optimization -- we know the number of samples in the unparsed data, so use it here to
        // avoid parsing just to know if the genotypes context is empty
        return loaded ? super.isEmpty() : nUnparsedGenotypes == 0;
    }

    @Override
    public int size() {
        // optimization -- we know the number of samples in the unparsed data, so use it here to
        // avoid parsing just to know the size of the context
        return loaded ? super.size() : nUnparsedGenotypes;
    }

    /** @return the still-encoded genotype data, or null once decode() has run */
    public Object getUnparsedGenotypeData() {
        return unparsedGenotypeData;
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,482 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import com.google.java.contract.*;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Builder class for VariantContext
|
||||
*
|
||||
* Some basic assumptions here:
|
||||
*
|
||||
* 1 -- data isn't protectively copied. If you provide an attribute map to
|
||||
* the build, and modify it later, the builder will see this and so will any
|
||||
* resulting variant contexts. It's best not to modify collections provided
|
||||
* to a builder.
|
||||
*
|
||||
* 2 -- the system uses the standard builder model, allowing the simple construction idiom:
|
||||
*
|
||||
* builder.source("a").genotypes(gc).id("x").make() => VariantContext
|
||||
*
|
||||
* 3 -- The best way to copy a VariantContext is:
|
||||
*
|
||||
* new VariantContextBuilder(vc).make() => a copy of VC
|
||||
*
|
||||
* 4 -- validation of arguments is done at the during the final make() call, so a
|
||||
* VariantContextBuilder can exist in an inconsistent state as long as those issues
|
||||
* are resolved before the call to make() is issued.
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class VariantContextBuilder {
|
||||
// required fields
|
||||
private boolean fullyDecoded = false;
|
||||
private String source = null;
|
||||
private String contig = null;
|
||||
private long start = -1;
|
||||
private long stop = -1;
|
||||
private Collection<Allele> alleles = null;
|
||||
|
||||
// optional -> these are set to the appropriate default value
|
||||
private String ID = VCFConstants.EMPTY_ID_FIELD;
|
||||
private GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES;
|
||||
private double log10PError = VariantContext.NO_LOG10_PERROR;
|
||||
private Set<String> filters = null;
|
||||
private Map<String, Object> attributes = null;
|
||||
private boolean attributesCanBeModified = false;
|
||||
|
||||
/** enum of what must be validated */
|
||||
final private EnumSet<VariantContext.Validation> toValidate = EnumSet.noneOf(VariantContext.Validation.class);
|
||||
|
||||
/**
|
||||
* Create an empty VariantContextBuilder where all values adopt their default values. Note that
|
||||
* source, chr, start, stop, and alleles must eventually be filled in, or the resulting VariantContext
|
||||
* will throw an error.
|
||||
*/
|
||||
public VariantContextBuilder() {}
|
||||
|
||||
/**
|
||||
* Create an empty VariantContextBuilder where all values adopt their default values, but the bare min.
|
||||
* of info (source, chr, start, stop, and alleles) have been provided to start.
|
||||
*/
|
||||
@Requires({"source != null", "contig != null", "start >= 0", "stop >= 0",
|
||||
"alleles != null && !alleles.isEmpty()"})
|
||||
public VariantContextBuilder(String source, String contig, long start, long stop, Collection<Allele> alleles) {
|
||||
this.source = source;
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.alleles = alleles;
|
||||
this.attributes = Collections.emptyMap(); // immutable
|
||||
toValidate.add(VariantContext.Validation.ALLELES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new builder based on parent -- the new VC will have all fields initialized
|
||||
* to their corresponding values in parent. This is the best way to create a derived VariantContext
|
||||
*
|
||||
* @param parent Cannot be null
|
||||
*/
|
||||
public VariantContextBuilder(VariantContext parent) {
|
||||
if ( parent == null ) throw new IllegalArgumentException("BUG: VariantContextBuilder parent argument cannot be null in VariantContextBuilder");
|
||||
this.alleles = parent.alleles;
|
||||
this.attributes = parent.getAttributes();
|
||||
this.attributesCanBeModified = false;
|
||||
this.contig = parent.contig;
|
||||
this.filters = parent.getFiltersMaybeNull();
|
||||
this.genotypes = parent.genotypes;
|
||||
this.ID = parent.getID();
|
||||
this.log10PError = parent.getLog10PError();
|
||||
this.source = parent.getSource();
|
||||
this.start = parent.getStart();
|
||||
this.stop = parent.getEnd();
|
||||
this.fullyDecoded = parent.isFullyDecoded();
|
||||
}
|
||||
|
||||
public VariantContextBuilder(VariantContextBuilder parent) {
|
||||
if ( parent == null ) throw new IllegalArgumentException("BUG: VariantContext parent argument cannot be null in VariantContextBuilder");
|
||||
this.alleles = parent.alleles;
|
||||
this.attributesCanBeModified = false;
|
||||
this.contig = parent.contig;
|
||||
this.genotypes = parent.genotypes;
|
||||
this.ID = parent.ID;
|
||||
this.log10PError = parent.log10PError;
|
||||
this.source = parent.source;
|
||||
this.start = parent.start;
|
||||
this.stop = parent.stop;
|
||||
this.fullyDecoded = parent.fullyDecoded;
|
||||
|
||||
this.attributes(parent.attributes);
|
||||
this.filters(parent.filters);
|
||||
}
|
||||
|
||||
public VariantContextBuilder copy() {
|
||||
return new VariantContextBuilder(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder to use this collection of alleles for the resulting VariantContext
|
||||
*
|
||||
* @param alleles
|
||||
* @return this builder
|
||||
*/
|
||||
@Requires({"alleles != null", "!alleles.isEmpty()"})
|
||||
public VariantContextBuilder alleles(final Collection<Allele> alleles) {
|
||||
this.alleles = alleles;
|
||||
toValidate.add(VariantContext.Validation.ALLELES);
|
||||
return this;
|
||||
}
|
||||
|
||||
public VariantContextBuilder alleles(final List<String> alleleStrings) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(alleleStrings.size());
|
||||
|
||||
for ( int i = 0; i < alleleStrings.size(); i++ ) {
|
||||
alleles.add(Allele.create(alleleStrings.get(i), i == 0));
|
||||
}
|
||||
|
||||
return alleles(alleles);
|
||||
}
|
||||
|
||||
public VariantContextBuilder alleles(final String ... alleleStrings) {
|
||||
return alleles(Arrays.asList(alleleStrings));
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return new ArrayList<Allele>(alleles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder to use this map of attributes alleles for the resulting VariantContext
|
||||
*
|
||||
* Attributes can be null -> meaning there are no attributes. After
|
||||
* calling this routine the builder assumes it can modify the attributes
|
||||
* object here, if subsequent calls are made to set attribute values
|
||||
* @param attributes
|
||||
*/
|
||||
public VariantContextBuilder attributes(final Map<String, Object> attributes) {
|
||||
if (attributes != null) {
|
||||
this.attributes = attributes;
|
||||
}
|
||||
else {
|
||||
this.attributes = new HashMap<String, Object>();
|
||||
}
|
||||
|
||||
this.attributesCanBeModified = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Puts the key -> value mapping into this builder's attributes
|
||||
*
|
||||
* @param key
|
||||
* @param value
|
||||
* @return
|
||||
*/
|
||||
@Requires({"key != null"})
|
||||
@Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()+1)"})
|
||||
public VariantContextBuilder attribute(final String key, final Object value) {
|
||||
makeAttributesModifiable();
|
||||
attributes.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes key if present in the attributes
|
||||
*
|
||||
* @param key
|
||||
* @return
|
||||
*/
|
||||
@Requires({"key != null"})
|
||||
@Ensures({"this.attributes.size() == old(this.attributes.size()) || this.attributes.size() == old(this.attributes.size()-1)"})
|
||||
public VariantContextBuilder rmAttribute(final String key) {
|
||||
makeAttributesModifiable();
|
||||
attributes.remove(key);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes the attributes field modifiable. In many cases attributes is just a pointer to an immutable
|
||||
* collection, so methods that want to add / remove records require the attributes to be copied to a
|
||||
*/
|
||||
@Ensures({"this.attributesCanBeModified"})
|
||||
private void makeAttributesModifiable() {
|
||||
if ( ! attributesCanBeModified ) {
|
||||
this.attributesCanBeModified = true;
|
||||
this.attributes = new HashMap<String, Object>(attributes);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This builder's filters are set to this value
|
||||
*
|
||||
* filters can be null -> meaning there are no filters
|
||||
* @param filters
|
||||
*/
|
||||
public VariantContextBuilder filters(final Set<String> filters) {
|
||||
this.filters = filters;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link #filters}
|
||||
*
|
||||
* @param filters
|
||||
* @return
|
||||
*/
|
||||
public VariantContextBuilder filters(final String ... filters) {
|
||||
filters(new LinkedHashSet<String>(Arrays.asList(filters)));
|
||||
return this;
|
||||
}
|
||||
|
||||
@Requires({"filter != null", "!filter.equals(\"PASS\")"})
|
||||
public VariantContextBuilder filter(final String filter) {
|
||||
if ( this.filters == null ) this.filters = new LinkedHashSet<String>(1);
|
||||
this.filters.add(filter);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should have PASS filters
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public VariantContextBuilder passFilters() {
|
||||
return filters(VariantContext.PASSES_FILTERS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext be unfiltered
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public VariantContextBuilder unfiltered() {
|
||||
this.filters = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should use this genotypes GenotypeContext
|
||||
*
|
||||
* Note that genotypes can be null -> meaning there are no genotypes
|
||||
*
|
||||
* @param genotypes
|
||||
*/
|
||||
public VariantContextBuilder genotypes(final GenotypesContext genotypes) {
|
||||
this.genotypes = genotypes;
|
||||
if ( genotypes != null )
|
||||
toValidate.add(VariantContext.Validation.GENOTYPES);
|
||||
return this;
|
||||
}
|
||||
|
||||
public VariantContextBuilder genotypesNoValidation(final GenotypesContext genotypes) {
|
||||
this.genotypes = genotypes;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes
|
||||
*
|
||||
* Note that genotypes can be null -> meaning there are no genotypes
|
||||
*
|
||||
* @param genotypes
|
||||
*/
|
||||
public VariantContextBuilder genotypes(final Collection<Genotype> genotypes) {
|
||||
return genotypes(GenotypesContext.copy(genotypes));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should use a GenotypeContext containing genotypes
|
||||
* @param genotypes
|
||||
*/
|
||||
public VariantContextBuilder genotypes(final Genotype ... genotypes) {
|
||||
return genotypes(GenotypesContext.copy(Arrays.asList(genotypes)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells this builder that the resulting VariantContext should not contain any GenotypeContext
|
||||
*/
|
||||
public VariantContextBuilder noGenotypes() {
|
||||
this.genotypes = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have ID
|
||||
* @param ID
|
||||
* @return
|
||||
*/
|
||||
@Requires("ID != null")
|
||||
public VariantContextBuilder id(final String ID) {
|
||||
this.ID = ID;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should not have an ID
|
||||
* @return
|
||||
*/
|
||||
public VariantContextBuilder noID() {
|
||||
return id(VCFConstants.EMPTY_ID_FIELD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have log10PError
|
||||
* @param log10PError
|
||||
* @return
|
||||
*/
|
||||
@Requires("log10PError <= 0 || log10PError == VariantContext.NO_LOG10_PERROR")
|
||||
public VariantContextBuilder log10PError(final double log10PError) {
|
||||
this.log10PError = log10PError;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have source field set to source
|
||||
* @param source
|
||||
* @return
|
||||
*/
|
||||
@Requires("source != null")
|
||||
public VariantContextBuilder source(final String source) {
|
||||
this.source = source;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified location
|
||||
* @param contig
|
||||
* @param start
|
||||
* @param stop
|
||||
* @return
|
||||
*/
|
||||
@Requires({"contig != null", "start >= 0", "stop >= 0"})
|
||||
public VariantContextBuilder loc(final String contig, final long start, final long stop) {
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
toValidate.add(VariantContext.Validation.ALLELES);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified contig chr
|
||||
* @param contig
|
||||
* @return
|
||||
*/
|
||||
@Requires({"contig != null"})
|
||||
public VariantContextBuilder chr(final String contig) {
|
||||
this.contig = contig;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified contig start
|
||||
* @param start
|
||||
* @return
|
||||
*/
|
||||
@Requires({"start >= 0"})
|
||||
public VariantContextBuilder start(final long start) {
|
||||
this.start = start;
|
||||
toValidate.add(VariantContext.Validation.ALLELES);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified contig stop
|
||||
* @param stop
|
||||
* @return
|
||||
*/
|
||||
@Requires({"stop >= 0"})
|
||||
public VariantContextBuilder stop(final long stop) {
|
||||
this.stop = stop;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #computeEndFromAlleles(java.util.List, int, int) with endForSymbolicAlleles == -1
|
||||
*/
|
||||
public VariantContextBuilder computeEndFromAlleles(final List<Allele> alleles, final int start) {
|
||||
return computeEndFromAlleles(alleles, start, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the end position for this VariantContext from the alleles themselves
|
||||
*
|
||||
* assigns this builder the stop position computed.
|
||||
*
|
||||
* @param alleles the list of alleles to consider. The reference allele must be the first one
|
||||
* @param start the known start position of this event
|
||||
* @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1
|
||||
* if no is expected but will throw an error if one is found
|
||||
* @return this builder
|
||||
*/
|
||||
@Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" })
|
||||
public VariantContextBuilder computeEndFromAlleles(final List<Allele> alleles, final int start, final int endForSymbolicAlleles) {
|
||||
stop(VariantContextUtils.computeEndFromAlleles(alleles, start, endForSymbolicAlleles));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if this builder contains fully decoded data
|
||||
*
|
||||
* See VariantContext for more information
|
||||
*/
|
||||
public boolean isFullyDecoded() {
|
||||
return fullyDecoded;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets this builder's fully decoded state to true.
|
||||
*
|
||||
* A fully decoded builder indicates that all fields are represented by their
|
||||
* proper java objects (e.g., Integer(10) not "10").
|
||||
*
|
||||
* See VariantContext for more information
|
||||
*
|
||||
* @param isFullyDecoded
|
||||
*/
|
||||
public VariantContextBuilder fullyDecoded(boolean isFullyDecoded) {
|
||||
this.fullyDecoded = isFullyDecoded;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes all of the builder data provided up to this point, and instantiates
|
||||
* a freshly allocated VariantContext with all of the builder data. This
|
||||
* VariantContext is validated as appropriate and if not failing QC (and
|
||||
* throwing an exception) is returned.
|
||||
*
|
||||
* Note that this function can be called multiple times to create multiple
|
||||
* VariantContexts from the same builder.
|
||||
*/
|
||||
public VariantContext make() {
|
||||
return new VariantContext(source, ID, contig, start, stop, alleles,
|
||||
genotypes, log10PError, filters, attributes,
|
||||
fullyDecoded, toValidate);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,374 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.jexl2.Expression;
|
||||
import org.apache.commons.jexl2.JexlEngine;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class VariantContextUtils {
|
||||
|
||||
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
|
||||
|
||||
final public static JexlEngine engine = new JexlEngine();
|
||||
private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false;
|
||||
|
||||
static {
|
||||
engine.setSilent(false); // will throw errors now for selects that don't evaluate properly
|
||||
engine.setLenient(false);
|
||||
engine.setDebug(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map given the VariantContext to reflect the
|
||||
* proper chromosome-based VCF tags
|
||||
*
|
||||
* @param vc the VariantContext
|
||||
* @param attributes the attributes map to populate; must not be null; may contain old values
|
||||
* @param removeStaleValues should we remove stale values from the mapping?
|
||||
* @return the attributes map provided as input, returned for programming convenience
|
||||
*/
|
||||
public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues) {
|
||||
return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet<String>(0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map given the VariantContext to reflect the
|
||||
* proper chromosome-based VCF tags
|
||||
*
|
||||
* @param vc the VariantContext
|
||||
* @param attributes the attributes map to populate; must not be null; may contain old values
|
||||
* @param removeStaleValues should we remove stale values from the mapping?
|
||||
* @param founderIds - Set of founders Ids to take into account. AF and FC will be calculated over the founders.
|
||||
* If empty or null, counts are generated for all samples as unrelated individuals
|
||||
* @return the attributes map provided as input, returned for programming convenience
|
||||
*/
|
||||
public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues, final Set<String> founderIds) {
|
||||
final int AN = vc.getCalledChrCount();
|
||||
|
||||
// if everyone is a no-call, remove the old attributes if requested
|
||||
if ( AN == 0 && removeStaleValues ) {
|
||||
if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) )
|
||||
attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
|
||||
if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) )
|
||||
attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
|
||||
if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) )
|
||||
attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
|
||||
return attributes;
|
||||
}
|
||||
|
||||
if ( vc.hasGenotypes() ) {
|
||||
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN);
|
||||
|
||||
// if there are alternate alleles, record the relevant tags
|
||||
if ( vc.getAlternateAlleles().size() > 0 ) {
|
||||
ArrayList<Double> alleleFreqs = new ArrayList<Double>();
|
||||
ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
|
||||
ArrayList<Integer> foundersAlleleCounts = new ArrayList<Integer>();
|
||||
double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds);
|
||||
int foundersAltChromosomes;
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
foundersAltChromosomes = vc.getCalledChrCount(allele,founderIds);
|
||||
alleleCounts.add(vc.getCalledChrCount(allele));
|
||||
foundersAlleleCounts.add(foundersAltChromosomes);
|
||||
if ( AN == 0 ) {
|
||||
alleleFreqs.add(0.0);
|
||||
} else {
|
||||
final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes;
|
||||
alleleFreqs.add(freq);
|
||||
}
|
||||
}
|
||||
|
||||
attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
|
||||
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
|
||||
} else {
|
||||
// if there's no alt AC and AF shouldn't be present
|
||||
attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
|
||||
attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
|
||||
}
|
||||
}
|
||||
|
||||
return attributes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
|
||||
* chromosome-based VCF tags based on the current VC produced by builder.make()
|
||||
*
|
||||
* @param builder the VariantContextBuilder we are updating
|
||||
* @param removeStaleValues should we remove stale values from the mapping?
|
||||
*/
|
||||
public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) {
|
||||
VariantContext vc = builder.make();
|
||||
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, new HashSet<String>(0)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
|
||||
* chromosome-based VCF tags based on the current VC produced by builder.make()
|
||||
*
|
||||
* @param builder the VariantContextBuilder we are updating
|
||||
* @param founderIds - Set of founders to take into account. AF and FC will be calculated over the founders only.
|
||||
* If empty or null, counts are generated for all samples as unrelated individuals
|
||||
* @param removeStaleValues should we remove stale values from the mapping?
|
||||
*/
|
||||
public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues, final Set<String> founderIds) {
|
||||
VariantContext vc = builder.make();
|
||||
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
|
||||
}
|
||||
|
||||
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
|
||||
VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
|
||||
if ( metaData == null ) metaData = header.getInfoHeaderLine(field);
|
||||
if ( metaData == null ) {
|
||||
if ( ASSUME_MISSING_FIELDS_ARE_STRINGS ) {
|
||||
if ( ! MISSING_KEYS_WARNED_ABOUT.contains(field) ) {
|
||||
MISSING_KEYS_WARNED_ABOUT.add(field);
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED )
|
||||
System.err.println("Field " + field + " missing from VCF header, assuming it is an unbounded string type");
|
||||
}
|
||||
return new VCFInfoHeaderLine(field, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Auto-generated string header for " + field);
|
||||
}
|
||||
else
|
||||
throw new TribbleException("Fully decoding VariantContext requires header line for all fields, but none was found for " + field);
|
||||
}
|
||||
return metaData;
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple but common wrapper for matching VariantContext objects using JEXL expressions
|
||||
*/
|
||||
public static class JexlVCMatchExp {
|
||||
public String name;
|
||||
public Expression exp;
|
||||
|
||||
/**
|
||||
* Create a new matcher expression with name and JEXL expression exp
|
||||
* @param name name
|
||||
* @param exp expression
|
||||
*/
|
||||
public JexlVCMatchExp(String name, Expression exp) {
|
||||
this.name = name;
|
||||
this.exp = exp;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain
|
||||
* the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
|
||||
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
|
||||
* match() below.
|
||||
*
|
||||
* @param names names
|
||||
* @param exps expressions
|
||||
* @return list of matches
|
||||
*/
|
||||
public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) {
|
||||
if ( names == null || exps == null )
|
||||
throw new IllegalArgumentException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) );
|
||||
|
||||
if ( names.length != exps.length )
|
||||
throw new IllegalArgumentException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps));
|
||||
|
||||
Map<String, String> map = new HashMap<String, String>();
|
||||
for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); }
|
||||
|
||||
return VariantContextUtils.initializeMatchExps(map);
|
||||
}
|
||||
|
||||
public static List<JexlVCMatchExp> initializeMatchExps(ArrayList<String> names, ArrayList<String> exps) {
|
||||
String[] nameArray = new String[names.size()];
|
||||
String[] expArray = new String[exps.size()];
|
||||
return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain
|
||||
* the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
|
||||
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
|
||||
* match() below.
|
||||
*
|
||||
* @param names_and_exps mapping of names to expressions
|
||||
* @return list of matches
|
||||
*/
|
||||
public static List<JexlVCMatchExp> initializeMatchExps(Map<String, String> names_and_exps) {
|
||||
List<JexlVCMatchExp> exps = new ArrayList<JexlVCMatchExp>();
|
||||
|
||||
for ( Map.Entry<String, String> elt : names_and_exps.entrySet() ) {
|
||||
String name = elt.getKey();
|
||||
String expStr = elt.getValue();
|
||||
|
||||
if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr);
|
||||
try {
|
||||
Expression exp = engine.createExpression(expStr);
|
||||
exps.add(new JexlVCMatchExp(name, exp));
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Argument " + name + "has a bad value. Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax.") ;
|
||||
}
|
||||
}
|
||||
|
||||
return exps;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if exp match VC. See collection<> version for full docs.
|
||||
* @param vc variant context
|
||||
* @param exp expression
|
||||
* @return true if there is a match
|
||||
*/
|
||||
public static boolean match(VariantContext vc, JexlVCMatchExp exp) {
|
||||
return match(vc,Arrays.asList(exp)).get(exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these
|
||||
* expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL
|
||||
* expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp
|
||||
* expressions.
|
||||
*
|
||||
* @param vc variant context
|
||||
* @param exps expressions
|
||||
* @return true if there is a match
|
||||
*/
|
||||
public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Collection<JexlVCMatchExp> exps) {
|
||||
return new JEXLMap(exps,vc);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if exp match VC/g. See collection<> version for full docs.
|
||||
* @param vc variant context
|
||||
* @param g genotype
|
||||
* @param exp expression
|
||||
* @return true if there is a match
|
||||
*/
|
||||
public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) {
|
||||
return match(vc,g,Arrays.asList(exp)).get(exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these
|
||||
* expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL
|
||||
* expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp
|
||||
* expressions.
|
||||
*
|
||||
* @param vc variant context
|
||||
* @param g genotype
|
||||
* @param exps expressions
|
||||
* @return true if there is a match
|
||||
*/
|
||||
public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Genotype g, Collection<JexlVCMatchExp> exps) {
|
||||
return new JEXLMap(exps,vc,g);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly allocated VC that is the same as VC, but without genotypes
|
||||
* @param vc variant context
|
||||
* @return new VC without genotypes
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
@Ensures("result != null")
|
||||
public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
|
||||
return new VariantContextBuilder(vc).noGenotypes().make();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
|
||||
* @param vcs collection of VCs
|
||||
* @return new VCs without genotypes
|
||||
*/
|
||||
@Requires("vcs != null")
|
||||
@Ensures("result != null")
|
||||
public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
|
||||
List<VariantContext> r = new ArrayList<VariantContext>();
|
||||
for ( VariantContext vc : vcs )
|
||||
r.add(sitesOnlyVariantContext(vc));
|
||||
return r;
|
||||
}
|
||||
|
||||
// TODO: remove that after testing
|
||||
// static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
|
||||
// Set<String> names = new HashSet<String>();
|
||||
// for ( VariantContext vc : unsortedVCs ) {
|
||||
// for ( String name : vc.getSampleNames() ) {
|
||||
// //System.out.printf("Checking %s %b%n", name, names.contains(name));
|
||||
// if ( names.contains(name) )
|
||||
// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
|
||||
// }
|
||||
//
|
||||
// names.addAll(vc.getSampleNames());
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
public static int getSize( VariantContext vc ) {
|
||||
return vc.getEnd() - vc.getStart() + 1;
|
||||
}
|
||||
|
||||
public static final Set<String> genotypeNames(final Collection<Genotype> genotypes) {
|
||||
final Set<String> names = new HashSet<String>(genotypes.size());
|
||||
for ( final Genotype g : genotypes )
|
||||
names.add(g.getSampleName());
|
||||
return names;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the end position for this VariantContext from the alleles themselves
|
||||
*
|
||||
* In the trivial case this is a single BP event and end = start (open intervals)
|
||||
* In general the end is start + ref length - 1, handling the case where ref length == 0
|
||||
* However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases
|
||||
*
|
||||
* @param alleles the list of alleles to consider. The reference allele must be the first one
|
||||
* @param start the known start position of this event
|
||||
* @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1
|
||||
* if no is expected but will throw an error if one is found
|
||||
* @return this builder
|
||||
*/
|
||||
@Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" })
|
||||
public static int computeEndFromAlleles(final List<Allele> alleles, final int start, final int endForSymbolicAlleles) {
|
||||
final Allele ref = alleles.get(0);
|
||||
|
||||
if ( ref.isNonReference() )
|
||||
throw new IllegalStateException("computeEndFromAlleles requires first allele to be reference");
|
||||
|
||||
if ( VariantContext.hasSymbolicAlleles(alleles) ) {
|
||||
if ( endForSymbolicAlleles == -1 )
|
||||
throw new IllegalStateException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided");
|
||||
return endForSymbolicAlleles;
|
||||
} else {
|
||||
return start + Math.max(ref.length() - 1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,326 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import org.apache.commons.jexl2.JexlContext;
|
||||
import org.apache.commons.jexl2.MapContext;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
* @author depristo
|
||||
*
|
||||
* Class VariantJEXLContext
|
||||
*
|
||||
* implements the JEXML context for VariantContext; this saves us from
|
||||
* having to generate a JEXML context lookup map everytime we want to evaluate an expression.
|
||||
*
|
||||
* This is package protected, only classes in variantcontext should have access to it.
|
||||
*
|
||||
* // todo -- clean up to remove or better support genotype filtering
|
||||
*/
|
||||
|
||||
class VariantJEXLContext implements JexlContext {
|
||||
// our stored variant context
|
||||
private VariantContext vc;
|
||||
|
||||
private interface AttributeGetter {
|
||||
public Object get(VariantContext vc);
|
||||
}
|
||||
|
||||
private static Map<String, AttributeGetter> x = new HashMap<String, AttributeGetter>();
|
||||
|
||||
static {
|
||||
x.put("vc", new AttributeGetter() { public Object get(VariantContext vc) { return vc; }});
|
||||
x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }});
|
||||
x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }});
|
||||
x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }});
|
||||
x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return -10 * vc.getLog10PError(); }});
|
||||
x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }});
|
||||
x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }});
|
||||
x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }});
|
||||
|
||||
// x.put("GT", new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }});
|
||||
x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }});
|
||||
x.put("hetCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }});
|
||||
x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }});
|
||||
}
|
||||
|
||||
public VariantJEXLContext(VariantContext vc) {
|
||||
this.vc = vc;
|
||||
}
|
||||
|
||||
public Object get(String name) {
|
||||
Object result = null;
|
||||
if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map
|
||||
result = x.get(name).get(vc);
|
||||
} else if ( vc.hasAttribute(name)) {
|
||||
result = vc.getAttribute(name);
|
||||
} else if ( vc.getFilters().contains(name) ) {
|
||||
result = "1";
|
||||
}
|
||||
|
||||
//System.out.printf("dynamic lookup %s => %s%n", name, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean has(String name) {
|
||||
return get(name) != null;
|
||||
}
|
||||
|
||||
public void set(String name, Object value) {
|
||||
throw new UnsupportedOperationException("remove() not supported on a VariantJEXLContext");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value
|
||||
* as requested to save as much processing time as possible.
|
||||
*
|
||||
* Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the
|
||||
* JexlContext's get()
|
||||
*
|
||||
*/
|
||||
|
||||
class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
|
||||
// our variant context and/or Genotype
|
||||
private final VariantContext vc;
|
||||
private final Genotype g;
|
||||
|
||||
// our context
|
||||
private JexlContext jContext = null;
|
||||
|
||||
// our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp
|
||||
private Map<VariantContextUtils.JexlVCMatchExp,Boolean> jexl;
|
||||
|
||||
|
||||
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc, Genotype g) {
|
||||
this.vc = vc;
|
||||
this.g = g;
|
||||
initialize(jexlCollection);
|
||||
}
|
||||
|
||||
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc) {
|
||||
this(jexlCollection, vc, null);
|
||||
}
|
||||
|
||||
private void initialize(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection) {
|
||||
jexl = new HashMap<VariantContextUtils.JexlVCMatchExp,Boolean>();
|
||||
for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) {
|
||||
jexl.put(exp, null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create the internal JexlContext, only when required. This code is where new JEXL context variables
|
||||
* should get added.
|
||||
*
|
||||
*/
|
||||
private void createContext() {
|
||||
if ( g == null ) {
|
||||
// todo -- remove dependancy on g to the entire system
|
||||
jContext = new VariantJEXLContext(vc);
|
||||
} else {
|
||||
//
|
||||
// this whole branch is here just to support G jexl operations
|
||||
//
|
||||
Map<String, Object> infoMap = new HashMap<String, Object>();
|
||||
|
||||
if ( vc != null ) {
|
||||
// create a mapping of what we know about the variant context, its Chromosome, positions, etc.
|
||||
infoMap.put("CHROM", vc.getChr());
|
||||
infoMap.put("POS", vc.getStart());
|
||||
infoMap.put("TYPE", vc.getType().toString());
|
||||
infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual()));
|
||||
|
||||
// add alleles
|
||||
infoMap.put("ALLELES", GeneralUtils.join(";", vc.getAlleles()));
|
||||
infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles()));
|
||||
|
||||
// add attributes
|
||||
addAttributesToMap(infoMap, vc.getAttributes());
|
||||
|
||||
// add filter fields
|
||||
infoMap.put("FILTER", vc.isFiltered() ? "1" : "0");
|
||||
for ( Object filterCode : vc.getFilters() ) {
|
||||
infoMap.put(String.valueOf(filterCode), "1");
|
||||
}
|
||||
|
||||
// add genotype-specific fields
|
||||
// TODO -- implement me when we figure out a good way to represent this
|
||||
// for ( Genotype g : vc.getGenotypes().values() ) {
|
||||
// String prefix = g.getSampleName() + ".";
|
||||
// addAttributesToMap(infoMap, g.getAttributes(), prefix);
|
||||
// infoMap.put(prefix + "GT", g.getGenotypeString());
|
||||
// }
|
||||
|
||||
// add specific genotype if one is provided
|
||||
infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString());
|
||||
infoMap.put("isHomRef", g.isHomRef() ? "1" : "0");
|
||||
infoMap.put("isHet", g.isHet() ? "1" : "0");
|
||||
infoMap.put("isHomVar", g.isHomVar() ? "1" : "0");
|
||||
infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, g.getGQ());
|
||||
if ( g.hasDP() )
|
||||
infoMap.put(VCFConstants.DEPTH_KEY, g.getDP());
|
||||
for ( Map.Entry<String, Object> e : g.getExtendedAttributes().entrySet() ) {
|
||||
if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) )
|
||||
infoMap.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// create the internal context that we can evaluate expressions against
|
||||
|
||||
jContext = new MapContext(infoMap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the size of the internal data structure
|
||||
*/
|
||||
public int size() {
|
||||
return jexl.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if we're empty
|
||||
*/
|
||||
public boolean isEmpty() { return this.jexl.isEmpty(); }
|
||||
|
||||
/**
|
||||
* do we contain the specified key
|
||||
* @param o the key
|
||||
* @return true if we have a value for that key
|
||||
*/
|
||||
public boolean containsKey(Object o) { return jexl.containsKey(o); }
|
||||
|
||||
public Boolean get(Object o) {
|
||||
// if we've already determined the value, return it
|
||||
if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o);
|
||||
|
||||
// try and cast the expression
|
||||
VariantContextUtils.JexlVCMatchExp e = (VariantContextUtils.JexlVCMatchExp) o;
|
||||
evaluateExpression(e);
|
||||
return jexl.get(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the keyset of map
|
||||
* @return a set of keys of type JexlVCMatchExp
|
||||
*/
|
||||
public Set<VariantContextUtils.JexlVCMatchExp> keySet() {
|
||||
return jexl.keySet();
|
||||
}
|
||||
|
||||
/**
|
||||
* get all the values of the map. This is an expensive call, since it evaluates all keys that haven't
|
||||
* been evaluated yet. This is fine if you truely want all the keys, but if you only want a portion, or know
|
||||
* the keys you want, you would be better off using get() to get them by name.
|
||||
* @return a collection of boolean values, representing the results of all the variants evaluated
|
||||
*/
|
||||
public Collection<Boolean> values() {
|
||||
// this is an expensive call
|
||||
for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet())
|
||||
if (jexl.get(exp) == null)
|
||||
evaluateExpression(exp);
|
||||
return jexl.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* evaulate a JexlVCMatchExp's expression, given the current context (and setup the context if it's null)
|
||||
* @param exp the JexlVCMatchExp to evaluate
|
||||
*/
|
||||
private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) {
|
||||
// if the context is null, we need to create it to evaluate the JEXL expression
|
||||
if (this.jContext == null) createContext();
|
||||
try {
|
||||
final Boolean value = (Boolean) exp.exp.evaluate(jContext);
|
||||
// treat errors as no match
|
||||
jexl.put(exp, value == null ? false : value);
|
||||
} catch (Exception e) {
|
||||
// if exception happens because variable is undefined (i.e. field in expression is not present), evaluate to FALSE
|
||||
// todo - might be safer if we explicitly checked for an exception type, but Apache's API doesn't seem to have that ability
|
||||
if (e.getMessage().contains("undefined variable"))
|
||||
jexl.put(exp,false);
|
||||
else
|
||||
throw new IllegalArgumentException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* helper function: adds the list of attributes to the information map we're building
|
||||
* @param infoMap the map
|
||||
* @param attributes the attributes
|
||||
*/
|
||||
private static void addAttributesToMap(Map<String, Object> infoMap, Map<String, ?> attributes ) {
|
||||
for (Map.Entry<String, ?> e : attributes.entrySet()) {
|
||||
infoMap.put(e.getKey(), String.valueOf(e.getValue()));
|
||||
}
|
||||
}
|
||||
|
||||
public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) {
|
||||
return jexl.put(jexlVCMatchExp,aBoolean);
|
||||
}
|
||||
|
||||
public void putAll(Map<? extends VariantContextUtils.JexlVCMatchExp, ? extends Boolean> map) {
|
||||
jexl.putAll(map);
|
||||
}
|
||||
|
||||
// //////////////////////////////////////////////////////////////////////////////////////
|
||||
// The Following are unsupported at the moment
|
||||
// //////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// this doesn't make much sense to implement, boolean doesn't offer too much variety to deal
|
||||
// with evaluating every key in the internal map.
|
||||
public boolean containsValue(Object o) {
|
||||
throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
// this doesn't make much sense
|
||||
public Boolean remove(Object o) {
|
||||
throw new UnsupportedOperationException("remove() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
|
||||
public Set<Entry<VariantContextUtils.JexlVCMatchExp, Boolean>> entrySet() {
|
||||
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
// nope
|
||||
public void clear() {
|
||||
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,279 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Type;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Utils;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* See #BCFWriter for documentation on this classes role in encoding BCF2 files
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
public final class BCF2Encoder {
|
||||
// TODO -- increase default size?
|
||||
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
|
||||
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Functions to return the data being encoded here
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Ensures("result != null")
|
||||
public byte[] getRecordBytes() {
|
||||
byte[] bytes = encodeStream.toByteArray();
|
||||
encodeStream.reset();
|
||||
return bytes;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Writing typed values (have type byte)
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedMissing(final BCF2Type type) throws IOException {
|
||||
encodeType(0, type);
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTyped(final Object value, final BCF2Type type) throws IOException {
|
||||
if ( value == null )
|
||||
encodeTypedMissing(type);
|
||||
else {
|
||||
switch ( type ) {
|
||||
case INT8:
|
||||
case INT16:
|
||||
case INT32: encodeTypedInt((Integer)value, type); break;
|
||||
case FLOAT: encodeTypedFloat((Double) value); break;
|
||||
case CHAR: encodeTypedString((String) value); break;
|
||||
default: throw new IllegalArgumentException("Illegal type encountered " + type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedInt(final int v) throws IOException {
|
||||
final BCF2Type type = BCF2Utils.determineIntegerType(v);
|
||||
encodeTypedInt(v, type);
|
||||
}
|
||||
|
||||
@Requires("type.isIntegerType()")
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException {
|
||||
encodeType(1, type);
|
||||
encodeRawInt(v, type);
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedString(final String s) throws IOException {
|
||||
encodeTypedString(s.getBytes());
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedString(final byte[] s) throws IOException {
|
||||
if ( s == null )
|
||||
encodeType(0, BCF2Type.CHAR);
|
||||
else {
|
||||
encodeType(s.length, BCF2Type.CHAR);
|
||||
for ( int i = 0; i < s.length; i++ ) {
|
||||
encodeRawChar(s[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTypedFloat(final double d) throws IOException {
|
||||
encodeType(1, BCF2Type.FLOAT);
|
||||
encodeRawFloat(d);
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
|
||||
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
|
||||
final String s = BCF2Utils.collapseStringList((List<String>) v);
|
||||
v = stringToBytes(s);
|
||||
}
|
||||
|
||||
encodeType(v.size(), type);
|
||||
encodeRawValues(v, type);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Writing raw values (don't have a type byte)
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
public final <T extends Object> void encodeRawValues(final Collection<T> v, final BCF2Type type) throws IOException {
|
||||
for ( final T v1 : v ) {
|
||||
encodeRawValue(v1, type);
|
||||
}
|
||||
}
|
||||
|
||||
public final <T extends Object> void encodeRawValue(final T value, final BCF2Type type) throws IOException {
|
||||
try {
|
||||
if ( value == type.getMissingJavaValue() )
|
||||
encodeRawMissingValue(type);
|
||||
else {
|
||||
switch (type) {
|
||||
case INT8:
|
||||
case INT16:
|
||||
case INT32: encodeRawBytes((Integer) value, type); break;
|
||||
case FLOAT: encodeRawFloat((Double) value); break;
|
||||
case CHAR: encodeRawChar((Byte) value); break;
|
||||
default: throw new IllegalArgumentException("Illegal type encountered " + type);
|
||||
}
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new ClassCastException("BUG: invalid type cast to " + type + " from " + value);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeRawMissingValue(final BCF2Type type) throws IOException {
|
||||
encodeRawBytes(type.getMissingBytes(), type);
|
||||
}
|
||||
|
||||
@Requires("size >= 0")
|
||||
public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException {
|
||||
for ( int i = 0; i < size; i++ )
|
||||
encodeRawMissingValue(type);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// low-level encoders
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
public final void encodeRawChar(final byte c) throws IOException {
|
||||
encodeStream.write(c);
|
||||
}
|
||||
|
||||
public final void encodeRawFloat(final double value) throws IOException {
|
||||
encodeRawBytes(Float.floatToIntBits((float) value), BCF2Type.FLOAT);
|
||||
}
|
||||
|
||||
@Requires("size >= 0")
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeType(final int size, final BCF2Type type) throws IOException {
|
||||
if ( size <= BCF2Utils.MAX_INLINE_ELEMENTS ) {
|
||||
final int typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
|
||||
encodeStream.write(typeByte);
|
||||
} else {
|
||||
final int typeByte = BCF2Utils.encodeTypeDescriptor(BCF2Utils.OVERFLOW_ELEMENT_MARKER, type);
|
||||
encodeStream.write(typeByte);
|
||||
// write in the overflow size
|
||||
encodeTypedInt(size);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeRawInt(final int value, final BCF2Type type) throws IOException {
|
||||
type.write(value, encodeStream);
|
||||
}
|
||||
|
||||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeRawBytes(final int value, final BCF2Type type) throws IOException {
|
||||
type.write(value, encodeStream);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Requires({"s != null", "sizeToWrite >= 0"})
|
||||
public void encodeRawString(final String s, final int sizeToWrite) throws IOException {
|
||||
final byte[] bytes = s.getBytes();
|
||||
for ( int i = 0; i < sizeToWrite; i++ )
|
||||
if ( i < bytes.length )
|
||||
encodeRawChar(bytes[i]);
|
||||
else
|
||||
encodeRawMissingValue(BCF2Type.CHAR);
|
||||
}
|
||||
|
||||
/**
|
||||
* Totally generic encoder that examines o, determines the best way to encode it, and encodes it
|
||||
*
|
||||
* This method is incredibly slow, but it's only used for UnitTests so it doesn't matter
|
||||
*
|
||||
* @param o
|
||||
* @return
|
||||
*/
|
||||
@Requires("o != null")
|
||||
public final BCF2Type encode(final Object o) throws IOException {
|
||||
if ( o == null ) throw new IllegalArgumentException("Generic encode cannot deal with null values");
|
||||
|
||||
if ( o instanceof List ) {
|
||||
final BCF2Type type = determineBCFType(((List) o).get(0));
|
||||
encodeTyped((List) o, type);
|
||||
return type;
|
||||
} else {
|
||||
final BCF2Type type = determineBCFType(o);
|
||||
encodeTyped(o, type);
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
@Requires("arg != null")
|
||||
private final BCF2Type determineBCFType(final Object arg) {
|
||||
final Object toType = arg instanceof List ? ((List)arg).get(0) : arg;
|
||||
|
||||
if ( toType instanceof Integer )
|
||||
return BCF2Utils.determineIntegerType((Integer) toType);
|
||||
else if ( toType instanceof String )
|
||||
return BCF2Type.CHAR;
|
||||
else if ( toType instanceof Double )
|
||||
return BCF2Type.FLOAT;
|
||||
else
|
||||
throw new IllegalArgumentException("No native encoding for Object of type " + arg.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
private final List<Byte> stringToBytes(final String v) throws IOException {
|
||||
if ( v == null || v.equals("") )
|
||||
return Collections.emptyList();
|
||||
else {
|
||||
// TODO -- this needs to be optimized away for efficiency
|
||||
final byte[] bytes = v.getBytes();
|
||||
final List<Byte> l = new ArrayList<Byte>(bytes.length);
|
||||
for ( int i = 0; i < bytes.length; i++) l.add(bytes[i]);
|
||||
return l;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,518 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Type;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.variant.vcf.VCFCompoundHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineCount;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* See #BCFWriter for documentation on this classes role in encoding BCF2 files
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
@Invariant({
|
||||
"headerLine != null",
|
||||
"dictionaryOffsetType.isIntegerType()",
|
||||
"dictionaryOffset >= 0"
|
||||
})
|
||||
public abstract class BCF2FieldEncoder {
|
||||
/**
|
||||
* The header line describing the field we will encode values of
|
||||
*/
|
||||
final VCFCompoundHeaderLine headerLine;
|
||||
|
||||
/**
|
||||
* The BCF2 type we'll use to encoder this field, if it can be determined statically.
|
||||
* If not, this variable must be null
|
||||
*/
|
||||
final BCF2Type staticType;
|
||||
|
||||
/**
|
||||
* The integer offset into the strings map of the BCF2 file corresponding to this
|
||||
* field.
|
||||
*/
|
||||
final int dictionaryOffset;
|
||||
|
||||
/**
|
||||
* The integer type we use to encode our dictionary offset in the BCF2 file
|
||||
*/
|
||||
final BCF2Type dictionaryOffsetType;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructor
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
 * Binds this encoder to its VCF header line and resolves the field's offset in the
 * BCF2 string dictionary; staticType may be null when the type cannot be determined
 * statically.
 */
@Requires({"headerLine != null", "dict != null"})
private BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict, final BCF2Type staticType) {
    this.headerLine = headerLine;
    this.staticType = staticType;

    // the field's name must already be in the BCF2 dictionary built from the header
    final Integer offset = dict.get(getField());
    if ( offset == null )
        throw new IllegalStateException("Format error: could not find string " + getField() + " in header as required by BCF");
    this.dictionaryOffset = offset;
    this.dictionaryOffsetType = BCF2Utils.determineIntegerType(offset);
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Basic accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
@Ensures("result != null")
|
||||
public final String getField() { return headerLine.getID(); }
|
||||
|
||||
/**
|
||||
* Write the field key (dictionary offset and type) into the BCF2Encoder stream
|
||||
*
|
||||
* @param encoder where we write our dictionary offset
|
||||
* @throws IOException
|
||||
*/
|
||||
@Requires("encoder != null")
|
||||
public final void writeFieldKey(final BCF2Encoder encoder) throws IOException {
|
||||
encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// methods to determine the number of encoded elements
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
@Ensures("result != null")
|
||||
protected final VCFHeaderLineCount getCountType() {
|
||||
return headerLine.getCountType();
|
||||
}
|
||||
|
||||
/**
|
||||
* True if this field has a constant, fixed number of elements (such as 1 for an atomic integer)
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != (hasValueDeterminedNumElements() || hasContextDeterminedNumElements())")
|
||||
public boolean hasConstantNumElements() {
|
||||
return getCountType() == VCFHeaderLineCount.INTEGER;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if the only way to determine how many elements this field contains is by
|
||||
* inspecting the actual value directly, such as when the number of elements
|
||||
* is a variable length list per site or per genotype.
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != (hasConstantNumElements() || hasContextDeterminedNumElements())")
|
||||
public boolean hasValueDeterminedNumElements() {
|
||||
return getCountType() == VCFHeaderLineCount.UNBOUNDED;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if this field has a non-fixed number of elements that depends only on the properties
|
||||
* of the current VariantContext, such as one value per Allele or per genotype configuration.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != (hasValueDeterminedNumElements() || hasConstantNumElements())")
|
||||
public boolean hasContextDeterminedNumElements() {
|
||||
return ! hasConstantNumElements() && ! hasValueDeterminedNumElements();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of elements, assuming this field has a constant number of elements.
|
||||
* @return
|
||||
*/
|
||||
@Requires("hasConstantNumElements()")
|
||||
@Ensures("result >= 0")
|
||||
public int numElements() {
|
||||
return headerLine.getCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of elements by looking at the actual value provided
|
||||
* @return
|
||||
*/
|
||||
@Requires("hasValueDeterminedNumElements()")
|
||||
@Ensures("result >= 0")
|
||||
public int numElements(final Object value) {
|
||||
return numElementsFromValue(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of elements, assuming this field has context-determined number of elements.
|
||||
* @return
|
||||
*/
|
||||
@Requires("hasContextDeterminedNumElements()")
|
||||
@Ensures("result >= 0")
|
||||
public int numElements(final VariantContext vc) {
|
||||
return headerLine.getCount(vc);
|
||||
}
|
||||
|
||||
/**
|
||||
* A convenience access for the number of elements, returning
|
||||
* the number of encoded elements, either from the fixed number
|
||||
* it has, from the VC, or from the value itself.
|
||||
* @param vc
|
||||
* @param value
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public final int numElements(final VariantContext vc, final Object value) {
|
||||
if ( hasConstantNumElements() ) return numElements();
|
||||
else if ( hasContextDeterminedNumElements() ) return numElements(vc);
|
||||
else return numElements(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a value, return the number of elements we will encode for it.
|
||||
*
|
||||
* Assumes the value is encoded as a List
|
||||
*
|
||||
* @param value
|
||||
* @return
|
||||
*/
|
||||
@Requires("hasValueDeterminedNumElements()")
|
||||
@Ensures("result >= 0")
|
||||
protected int numElementsFromValue(final Object value) {
|
||||
if ( value == null ) return 0;
|
||||
else if ( value instanceof List ) return ((List) value).size();
|
||||
else return 1;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// methods to determine the BCF2 type of the encoded values
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Is the BCF2 type of this field static, or does it have to be determine from
|
||||
* the actual field value itself?
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result || isDynamicallyTyped()")
|
||||
public final boolean isStaticallyTyped() { return ! isDynamicallyTyped(); }
|
||||
|
||||
/**
|
||||
* Is the BCF2 type of this field static, or does it have to be determine from
|
||||
* the actual field value itself?
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result || isStaticallyTyped()")
|
||||
public final boolean isDynamicallyTyped() { return staticType == null; }
|
||||
|
||||
/**
|
||||
* Get the BCF2 type for this field, either from the static type of the
|
||||
* field itself or by inspecting the value itself.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public final BCF2Type getType(final Object value) {
|
||||
return isDynamicallyTyped() ? getDynamicType(value) : getStaticType();
|
||||
}
|
||||
|
||||
@Requires("isStaticallyTyped()")
|
||||
@Ensures("result != null")
|
||||
public final BCF2Type getStaticType() {
|
||||
return staticType;
|
||||
}
|
||||
|
||||
@Requires("isDynamicallyTyped()")
|
||||
@Ensures("result != null")
|
||||
public BCF2Type getDynamicType(final Object value) {
|
||||
throw new IllegalStateException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField());
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// methods to encode values, including the key abstract method
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Key abstract method that should encode a value of the given type into the encoder.
|
||||
*
|
||||
* Value will be of a type appropriate to the underlying encoder. If the genotype field is represented as
|
||||
* an int[], this will be value, and the encoder needs to handle encoding all of the values in the int[].
|
||||
*
|
||||
* The argument should be used, not the getType() method in the superclass as an outer loop might have
|
||||
* decided a more general type (int16) to use, even through this encoder could have been done with int8.
|
||||
*
|
||||
* If minValues > 0, then encodeValue must write in at least minValues items from value. If value is atomic,
|
||||
* this means that minValues - 1 MISSING values should be added to the encoder. If minValues is a collection
|
||||
* type (int[]) then minValues - values.length should be added. This argument is intended to handle padding
|
||||
* of values in genotype fields.
|
||||
*
|
||||
* @param encoder
|
||||
* @param value
|
||||
* @param type
|
||||
* @param minValues
|
||||
* @throws IOException
|
||||
*/
|
||||
@Requires({"encoder != null", "isDynamicallyTyped() || type == getStaticType()", "minValues >= 0"})
|
||||
public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Subclass to encode Strings
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public static class StringOrCharacter extends BCF2FieldEncoder {
|
||||
public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, BCF2Type.CHAR);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
final String s = javaStringToBCF2String(value);
|
||||
encoder.encodeRawString(s, Math.max(s.length(), minValues));
|
||||
}
|
||||
|
||||
//
|
||||
// Regardless of what the header says, BCF2 strings and characters are always encoded
|
||||
// as arrays of CHAR type, which has a variable number of elements depending on the
|
||||
// exact string being encoded
|
||||
//
|
||||
@Override public boolean hasConstantNumElements() { return false; }
|
||||
@Override public boolean hasContextDeterminedNumElements() { return false; }
|
||||
@Override public boolean hasValueDeterminedNumElements() { return true; }
|
||||
@Override protected int numElementsFromValue(final Object value) {
|
||||
return value == null ? 0 : javaStringToBCF2String(value).length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Recode the incoming object to a String, compacting it into a
|
||||
* BCF2 string if the value is a list.
|
||||
*
|
||||
* @param value a String or List<String> to encode, or null
|
||||
* @return a non-null string to encode
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
private String javaStringToBCF2String(final Object value) {
|
||||
if ( value == null )
|
||||
return "";
|
||||
else if (value instanceof List) {
|
||||
final List<String> l = (List<String>)value;
|
||||
if ( l.isEmpty() ) return "";
|
||||
else return BCF2Utils.collapseStringList(l);
|
||||
} else
|
||||
return (String)value;
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Subclass to encode FLAG
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public static class Flag extends BCF2FieldEncoder {
|
||||
public Flag(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, BCF2Type.INT8);
|
||||
if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 )
|
||||
throw new IllegalStateException("Flag encoder only supports atomic flags for field " + getField());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numElements() {
|
||||
return 1; // the header says 0 but we will write 1 value
|
||||
}
|
||||
|
||||
@Override
|
||||
@Requires({"minValues <= 1", "value != null", "value instanceof Boolean", "((Boolean)value) == true"})
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
encoder.encodeRawBytes(1, getStaticType());
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Subclass to encode FLOAT
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public static class Float extends BCF2FieldEncoder {
|
||||
final boolean isAtomic;
|
||||
|
||||
public Float(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, BCF2Type.FLOAT);
|
||||
isAtomic = hasConstantNumElements() && numElements() == 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
int count = 0;
|
||||
// TODO -- can be restructured to avoid toList operation
|
||||
if ( isAtomic ) {
|
||||
// fast path for fields with 1 fixed float value
|
||||
if ( value != null ) {
|
||||
encoder.encodeRawFloat((Double)value);
|
||||
count++;
|
||||
}
|
||||
} else {
|
||||
// handle generic case
|
||||
final List<Double> doubles = toList(Double.class, value);
|
||||
for ( final Double d : doubles ) {
|
||||
if ( d != null ) { // necessary because .,. => [null, null] in VC
|
||||
encoder.encodeRawFloat(d);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Subclass to encode int[]
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public static class IntArray extends BCF2FieldEncoder {
|
||||
public IntArray(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int numElementsFromValue(final Object value) {
|
||||
return value == null ? 0 : ((int[])value).length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BCF2Type getDynamicType(final Object value) {
|
||||
return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value);
|
||||
}
|
||||
|
||||
@Requires("value == null || ((int[])value).length <= minValues")
|
||||
@Override
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
int count = 0;
|
||||
if ( value != null ) {
|
||||
for ( final int i : (int[])value ) {
|
||||
encoder.encodeRawInt(i, type);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Subclass to encode List<Integer>
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Specialized int encoder for atomic (non-list) integers
|
||||
*/
|
||||
public static class AtomicInt extends BCF2FieldEncoder {
|
||||
public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BCF2Type getDynamicType(final Object value) {
|
||||
return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
int count = 0;
|
||||
if ( value != null ) {
|
||||
encoder.encodeRawInt((Integer)value, type);
|
||||
count++;
|
||||
}
|
||||
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
|
||||
}
|
||||
}
|
||||
|
||||
public static class GenericInts extends BCF2FieldEncoder {
|
||||
public GenericInts(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
|
||||
super(headerLine, dict, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BCF2Type getDynamicType(final Object value) {
|
||||
return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(toList(Integer.class, value));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
|
||||
int count = 0;
|
||||
for ( final Integer i : toList(Integer.class, value) ) {
|
||||
if ( i != null ) { // necessary because .,. => [null, null] in VC
|
||||
encoder.encodeRawInt(i, type);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Helper methods
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Helper function that takes an object and returns a list representation
|
||||
* of it:
|
||||
*
|
||||
* o == null => []
|
||||
* o is a list => o
|
||||
* else => [o]
|
||||
*
|
||||
* @param o
|
||||
* @return
|
||||
*/
|
||||
private final static <T> List<T> toList(final Class<T> c, final Object o) {
|
||||
if ( o == null ) return Collections.emptyList();
|
||||
else if ( o instanceof List ) return (List<T>)o;
|
||||
else return Collections.singletonList((T)o);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,337 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Type;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Writes a single INFO or FORMAT field (key plus values) into a BCF2 stream,
 * delegating value serialization to a {@link BCF2FieldEncoder}.
 *
 * See #BCFWriter for documentation on this classes role in encoding BCF2 files
 *
 * @author Mark DePristo
 * @since 06/12
 */
public abstract class BCF2FieldWriter {
    private final VCFHeader header;
    private final BCF2FieldEncoder fieldEncoder;

    @Requires({"header != null", "fieldEncoder != null"})
    protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        this.header = header;
        this.fieldEncoder = fieldEncoder;
    }

    /** @return the VCF header this writer was built against */
    @Ensures("result != null")
    protected VCFHeader getHeader() { return header; }
    /** @return the encoder responsible for serializing this field's values */
    @Ensures("result != null")
    protected BCF2FieldEncoder getFieldEncoder() {
        return fieldEncoder;
    }
    /** @return the ID of the field this writer handles */
    @Ensures("result != null")
    protected String getField() { return getFieldEncoder().getField(); }

    /**
     * Begin writing this field for vc: emits the field key (dictionary offset).
     * Subclasses extend this to also emit the per-record type/size descriptor.
     */
    @Requires("vc != null")
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        fieldEncoder.writeFieldKey(encoder);
    }

    public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness

    @Override
    public String toString() {
        return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder();
    }

    // --------------------------------------------------------------------------------
    //
    // Sites writers
    //
    // --------------------------------------------------------------------------------

    /** Base class for writers of site-level (INFO) fields. */
    public static abstract class SiteWriter extends BCF2FieldWriter {
        protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        /** Write the value of this INFO field for vc into the encoder. */
        public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException;
    }

    /** Default INFO-field writer: reads the attribute from the VC and encodes it. */
    public static class GenericSiteWriter extends SiteWriter {
        public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            final Object rawValue = vc.getAttribute(getField(), null);
            final BCF2Type type = getFieldEncoder().getType(rawValue);
            if ( rawValue == null ) {
                // the value is missing, just write in null
                encoder.encodeType(0, type);
            } else {
                final int valueCount = getFieldEncoder().numElements(vc, rawValue);
                encoder.encodeType(valueCount, type);
                getFieldEncoder().encodeValue(encoder, rawValue, type, valueCount);
            }
        }
    }

    // --------------------------------------------------------------------------------
    //
    // Genotypes writers
    //
    // --------------------------------------------------------------------------------

    /**
     * Base class for writers of genotype (FORMAT) fields. start() computes and emits
     * the per-record element count and type; addGenotype() is then called once per sample.
     */
    public static abstract class GenotypesWriter extends BCF2FieldWriter {
        // number of values written per genotype for the current record; set in start()
        int nValuesPerGenotype = -1;
        // the BCF2 type used for the current record's values; set by subclasses
        BCF2Type encodingType = null;

        protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);

            if ( fieldEncoder.hasConstantNumElements() ) {
                nValuesPerGenotype = getFieldEncoder().numElements();
            }
        }

        @Override
        @Requires({"encodingType != null",
                "nValuesPerGenotype >= 0 || ! getFieldEncoder().hasConstantNumElements()"})
        @Ensures("nValuesPerGenotype >= 0")
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // writes the key information
            super.start(encoder, vc);

            // only update if we need to
            if ( ! getFieldEncoder().hasConstantNumElements() ) {
                if ( getFieldEncoder().hasContextDeterminedNumElements() )
                    // we are cheap -- just depends on genotype of allele counts
                    nValuesPerGenotype = getFieldEncoder().numElements(vc);
                else
                    // we have to go fishing through the values themselves (expensive)
                    nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc);
            }

            encoder.encodeType(nValuesPerGenotype, encodingType);
        }

        /** Encode this sample's value for the field, padded to nValuesPerGenotype. */
        @Requires({"encodingType != null", "nValuesPerGenotype >= 0"})
        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            final Object fieldValue = g.getExtendedAttribute(getField(), null);
            getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
        }

        /** @return the number of elements this sample's value contributes */
        @Ensures({"result >= 0"})
        protected int numElements(final VariantContext vc, final Genotype g) {
            return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField()));
        }

        // NOTE(review): size starts at -1, so for a VC with zero genotypes this returns -1,
        // violating the @Ensures contract and feeding -1 into encodeType() in start() —
        // presumably genotype writers are only invoked when samples exist; confirm with callers.
        @Ensures({"result >= 0"})
        private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) {
            int size = -1;

            for ( final Genotype g : vc.getGenotypes() ) {
                size = Math.max(size, numElements(vc, g));
            }

            return size;
        }
    }

    /** Genotypes writer for fields whose BCF2 type is fixed by the encoder. */
    public static class StaticallyTypeGenotypesWriter extends GenotypesWriter {
        public StaticallyTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
            encodingType = getFieldEncoder().getStaticType();
        }
    }

    /**
     * Genotypes writer for integer fields: scans all sample values per record to
     * choose the narrowest integer type that can hold every value.
     */
    public static class IntegerTypeGenotypesWriter extends GenotypesWriter {
        public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // the only value that is dynamic are integers
            final List<Integer> values = new ArrayList<Integer>(vc.getNSamples());
            for ( final Genotype g : vc.getGenotypes() ) {
                for ( final Object i : BCF2Utils.toList(g.getExtendedAttribute(getField(), null)) ) {
                    if ( i != null ) values.add((Integer)i); // we know they are all integers
                }
            }

            encodingType = BCF2Utils.determineIntegerType(values);
            super.start(encoder, vc);
        }
    }

    /**
     * Genotypes writer for int-valued fields with fast accessors (e.g. DP, GQ, AD, PL),
     * reading values through an IntGenotypeFieldAccessors.Accessor instead of the
     * generic extended-attribute map.
     */
    public static class IGFGenotypesWriter extends GenotypesWriter {
        final IntGenotypeFieldAccessors.Accessor ige;

        public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) {
            super(header, fieldEncoder);
            this.ige = ige;

            if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) )
                throw new IllegalArgumentException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField());
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // TODO
            // TODO this piece of code consumes like 10% of the runtime alone because fo the vc.getGenotypes() iteration
            // TODO
            encodingType = BCF2Type.INT8;
            for ( final Genotype g : vc.getGenotypes() ) {
                final int[] pls = ige.getValues(g);
                final BCF2Type plsType = getFieldEncoder().getType(pls);
                // widen to the largest integer type any sample requires
                encodingType = BCF2Utils.maxIntegerType(encodingType, plsType);
                if ( encodingType == BCF2Type.INT32 )
                    break; // stop early
            }

            super.start(encoder, vc);
        }

        @Override
        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype);
        }

        @Override
        protected int numElements(final VariantContext vc, final Genotype g) {
            return ige.getSize(g);
        }
    }

    /** Genotypes writer for the FT (per-sample filter) field, read via Genotype.getFilters(). */
    public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter {
        public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            final String fieldValue = g.getFilters();
            getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
        }

        @Override
        protected int numElements(final VariantContext vc, final Genotype g) {
            return getFieldEncoder().numElements(vc, g.getFilters());
        }
    }

    /**
     * Genotypes writer for the GT field. Encodes each allele as ((offset+1) << 1) | phased-bit,
     * padding samples with ploidy below the record maximum with MISSING.
     */
    public static class GTWriter extends GenotypesWriter {
        // allele => offset map for the 3rd+ alleles of multi-allelic sites (slow path)
        final Map<Allele, Integer> alleleMapForTriPlus = new HashMap<Allele, Integer>(5);
        // fast-path references for the two most common alleles of the current record
        Allele ref, alt1;

        public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
                throw new IllegalStateException("Current BCF2 encoder cannot handle sites " +
                        "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
                        + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());

            encodingType = BCF2Type.INT8;
            buildAlleleMap(vc);
            nValuesPerGenotype = vc.getMaxPloidy(2);

            super.start(encoder, vc);
        }

        @Override
        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            final int samplePloidy = g.getPloidy();
            for ( int i = 0; i < nValuesPerGenotype; i++ ) {
                if ( i < samplePloidy ) {
                    // we encode the actual allele
                    final Allele a = g.getAllele(i);
                    final int offset = getAlleleOffset(a);
                    final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
                    encoder.encodeRawBytes(encoded, encodingType);
                } else {
                    // we need to pad with missing as we have ploidy < max for this sample
                    encoder.encodeRawBytes(encodingType.getMissingBytes(), encodingType);
                }
            }
        }

        /**
         * Fast path code to determine the offset.
         *
         * Inline tests for == against ref (most common, first test)
         * == alt1 (second most common, second test)
         * == NO_CALL (third)
         * and finally in the map from allele => offset for all alt 2+ alleles
         *
         * @param a the allele whose offset we wish to determine
         * @return the offset (from 0) of the allele in the list of variant context alleles (-1 means NO_CALL)
         */
        @Requires("a != null")
        private final int getAlleleOffset(final Allele a) {
            if ( a == ref ) return 0;
            else if ( a == alt1 ) return 1;
            else if ( a == Allele.NO_CALL ) return -1;
            else {
                final Integer o = alleleMapForTriPlus.get(a);
                if ( o == null ) throw new IllegalStateException("BUG: Couldn't find allele offset for allele " + a);
                return o;
            }
        }

        /** Rebuild ref/alt1 fast-path fields and the tri+ allele map for vc. */
        private final void buildAlleleMap(final VariantContext vc) {
            // these are fast path options to determine the offsets for
            final int nAlleles = vc.getNAlleles();
            ref = vc.getReference();
            alt1 = nAlleles > 1 ? vc.getAlternateAllele(0) : null;

            if ( nAlleles > 2 ) {
                // for multi-allelics we need to clear the map, and add additional looks
                alleleMapForTriPlus.clear();
                final List<Allele> alleles = vc.getAlleles();
                for ( int i = 2; i < alleles.size(); i++ ) {
                    alleleMapForTriPlus.put(alleles.get(i), i);
                }
            }
        }
    }
}
|
||||
|
||||
|
|
@ -1,180 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* See #BCFWriter for documentation on this classes role in encoding BCF2 files
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
public class BCF2FieldWriterManager {
|
||||
final Map<String, BCF2FieldWriter.SiteWriter> siteWriters = new HashMap<String, BCF2FieldWriter.SiteWriter>();
|
||||
final Map<String, BCF2FieldWriter.GenotypesWriter> genotypesWriters = new HashMap<String, BCF2FieldWriter.GenotypesWriter>();
|
||||
final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
|
||||
|
||||
public BCF2FieldWriterManager() { }
|
||||
|
||||
/**
|
||||
* Setup the FieldWriters appropriate to each INFO and FORMAT in the VCF header
|
||||
*
|
||||
* Must be called before any of the getter methods will work
|
||||
*
|
||||
* @param header a VCFHeader containing description for every INFO and FORMAT field we'll attempt to write out to BCF
|
||||
* @param encoder the encoder we are going to use to write out the BCF2 data
|
||||
* @param stringDictionary a map from VCFHeader strings to their offsets for encoding
|
||||
*/
|
||||
public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
|
||||
for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
|
||||
final String field = line.getID();
|
||||
final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
|
||||
add(siteWriters, field, writer);
|
||||
}
|
||||
|
||||
for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
|
||||
final String field = line.getID();
|
||||
final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
|
||||
add(genotypesWriters, field, writer);
|
||||
}
|
||||
}
|
||||
|
||||
@Requires({"field != null", "writer != null"})
|
||||
@Ensures("map.containsKey(field)")
|
||||
private final <T> void add(final Map<String, T> map, final String field, final T writer) {
|
||||
if ( map.containsKey(field) )
|
||||
throw new IllegalStateException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders");
|
||||
map.put(field, writer);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Master routine to look at the header, a specific line, and
|
||||
// build an appropriate SiteWriter for that header element
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header,
|
||||
final VCFInfoHeaderLine line,
|
||||
final BCF2Encoder encoder,
|
||||
final Map<String, Integer> dict) {
|
||||
return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false));
|
||||
}
|
||||
|
||||
private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line,
|
||||
final BCF2Encoder encoder,
|
||||
final Map<String, Integer> dict,
|
||||
final boolean createGenotypesEncoders ) {
|
||||
|
||||
if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) {
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED && line.getType() != VCFHeaderLineType.Integer )
|
||||
System.err.println("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line);
|
||||
return new BCF2FieldEncoder.IntArray(line, dict);
|
||||
} else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||
return new BCF2FieldEncoder.GenericInts(line, dict);
|
||||
} else {
|
||||
switch ( line.getType() ) {
|
||||
case Character:
|
||||
case String:
|
||||
return new BCF2FieldEncoder.StringOrCharacter(line, dict);
|
||||
case Flag:
|
||||
return new BCF2FieldEncoder.Flag(line, dict);
|
||||
case Float:
|
||||
return new BCF2FieldEncoder.Float(line, dict);
|
||||
case Integer:
|
||||
if ( line.isFixedCount() && line.getCount() == 1 )
|
||||
return new BCF2FieldEncoder.AtomicInt(line, dict);
|
||||
else
|
||||
return new BCF2FieldEncoder.GenericInts(line, dict);
|
||||
default:
|
||||
throw new IllegalArgumentException("Unexpected type for field " + line.getID());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Master routine to look at the header, a specific line, and
|
||||
// build an appropriate Genotypes for that header element
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header,
|
||||
final VCFFormatHeaderLine line,
|
||||
final BCF2Encoder encoder,
|
||||
final Map<String, Integer> dict) {
|
||||
final String field = line.getID();
|
||||
final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true);
|
||||
|
||||
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
|
||||
return new BCF2FieldWriter.GTWriter(header, fieldEncoder);
|
||||
} else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
|
||||
return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder);
|
||||
} else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
|
||||
return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field));
|
||||
} else if ( line.getType() == VCFHeaderLineType.Integer ) {
|
||||
return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder);
|
||||
} else {
|
||||
return new BCF2FieldWriter.StaticallyTypeGenotypesWriter(header, fieldEncoder);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Accessors to get site / genotype writers
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Get a site writer specialized to encode values for site info field
|
||||
* @param field key found in the VCF header INFO records
|
||||
* @return non-null writer if one can be found, or null if none exists for field
|
||||
*/
|
||||
public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) {
|
||||
return getWriter(field, siteWriters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a genotypes writer specialized to encode values for genotypes field
|
||||
* @param field key found in the VCF header FORMAT records
|
||||
* @return non-null writer if one can be found, or null if none exists for field
|
||||
*/
|
||||
public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) {
|
||||
return getWriter(field, genotypesWriters);
|
||||
}
|
||||
|
||||
@Requires({"map != null", "key != null"})
|
||||
public <T> T getWriter(final String key, final Map<String, T> map) {
|
||||
return map.get(key);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,425 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Type;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.variant.bcf2.BCFVersion;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFContigHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.vcf.VCFUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* VariantContextWriter that emits BCF2 binary encoding
|
||||
*
|
||||
* Overall structure of this writer is complex for efficiency reasons
|
||||
*
|
||||
* -- The BCF2Writer manages the low-level BCF2 encoder, the mappings
|
||||
* from contigs and strings to offsets, the VCF header, and holds the
|
||||
* lower-level encoders that map from VC and Genotype fields to their
|
||||
* specific encoders. This class also writes out the standard BCF2 fields
|
||||
* like POS, contig, the size of info and genotype data, QUAL, etc. It
|
||||
* has loops over the INFO and GENOTYPES to encode each individual datum
|
||||
* with the generic field encoders, but the actual encoding work is
|
||||
* done with by the FieldWriters classes themselves
|
||||
*
|
||||
* -- BCF2FieldWriter are specialized classes for writing out SITE and
|
||||
* genotype information for specific SITE/GENOTYPE fields (like AC for
|
||||
* sites and GQ for genotypes). These are objects in themselves because
|
||||
* the manage all of the complexity of relating the types in the VCF header
|
||||
* with the proper encoding in BCF as well as the type representing this
|
||||
* in java. Relating all three of these pieces of information together
|
||||
* is the main complexity challenge in the encoder. The piece of code
|
||||
* that determines which FieldWriters to associate with each SITE and
|
||||
* GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters
|
||||
* are specialized for specific combinations of encoders (see below)
|
||||
* and contexts (genotypes) for efficiency, so they smartly manage
|
||||
* the writing of PLs (encoded as int[]) directly into the lowest
|
||||
* level BCFEncoder.
|
||||
*
|
||||
* -- At the third level is the BCF2FieldEncoder, relatively simple
|
||||
* pieces of code that handle the task of determining the right
|
||||
* BCF2 type for specific field values, as well as reporting back
|
||||
* information such as the number of elements used to encode it
|
||||
* (simple for atomic values like Integer but complex for PLs
|
||||
* or lists of strings)
|
||||
*
|
||||
* -- At the lowest level is the BCF2Encoder itself. This provides
|
||||
* just the limited encoding methods specified by the BCF2 specification. This encoder
|
||||
* doesn't do anything but make it possible to conveniently write out valid low-level
|
||||
* BCF2 constructs.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 06/12
|
||||
*/
|
||||
class BCF2Writer extends IndexingVariantContextWriter {
|
||||
public static final int MAJOR_VERSION = 2;
|
||||
public static final int MINOR_VERSION = 1;
|
||||
|
||||
final private static boolean ALLOW_MISSING_CONTIG_LINES = false;
|
||||
|
||||
private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
|
||||
private VCFHeader header;
|
||||
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
|
||||
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
|
||||
private final boolean doNotWriteGenotypes;
|
||||
private String[] sampleNames = null;
|
||||
|
||||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
||||
|
||||
/**
|
||||
* cached results for whether we can write out raw genotypes data.
|
||||
*/
|
||||
private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
|
||||
private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
|
||||
|
||||
|
||||
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
|
||||
this.outputStream = getOutputStream();
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Interface functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
// make sure the header is sorted correctly
|
||||
header = new VCFHeader(header.getMetaDataInSortedOrder(), header.getGenotypeSamples());
|
||||
|
||||
// create the config offsets map
|
||||
if ( header.getContigLines().isEmpty() ) {
|
||||
if ( ALLOW_MISSING_CONTIG_LINES ) {
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
|
||||
System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary");
|
||||
}
|
||||
createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
|
||||
} else {
|
||||
throw new IllegalStateException("Cannot write BCF2 file with missing contig lines");
|
||||
}
|
||||
} else {
|
||||
createContigDictionary(header.getContigLines());
|
||||
}
|
||||
|
||||
// set up the map from dictionary string values -> offset
|
||||
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
|
||||
for ( int i = 0; i < dict.size(); i++ ) {
|
||||
stringDictionaryMap.put(dict.get(i), i);
|
||||
}
|
||||
|
||||
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
|
||||
|
||||
// setup the field encodings
|
||||
fieldManager.setup(header, encoder, stringDictionaryMap);
|
||||
|
||||
try {
|
||||
// write out the header into a byte stream, get it's length, and write everything to the file
|
||||
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
|
||||
final OutputStreamWriter writer = new OutputStreamWriter(capture);
|
||||
this.header = VCFWriter.writeHeader(header, writer, doNotWriteGenotypes, VCFWriter.getVersionLine(), "BCF2 stream");
|
||||
writer.append('\0'); // the header is null terminated by a byte
|
||||
writer.close();
|
||||
|
||||
final byte[] headerBytes = capture.toByteArray();
|
||||
new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream);
|
||||
BCF2Type.INT32.write(headerBytes.length, outputStream);
|
||||
outputStream.write(headerBytes);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("BCF2 stream: Got IOException while trying to write BCF2 header", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add( VariantContext vc ) {
|
||||
if ( doNotWriteGenotypes )
|
||||
vc = new VariantContextBuilder(vc).noGenotypes().make();
|
||||
vc = vc.fullyDecode(header, false);
|
||||
|
||||
super.add(vc); // allow on the fly indexing
|
||||
|
||||
try {
|
||||
final byte[] infoBlock = buildSitesData(vc);
|
||||
final byte[] genotypesBlock = buildSamplesData(vc);
|
||||
|
||||
// write the two blocks to disk
|
||||
writeBlock(infoBlock, genotypesBlock);
|
||||
}
|
||||
catch ( IOException e ) {
|
||||
throw new RuntimeException("Error writing record to BCF2 file: " + vc.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
try {
|
||||
outputStream.flush();
|
||||
outputStream.close();
|
||||
}
|
||||
catch ( IOException e ) {
|
||||
throw new RuntimeException("Failed to close BCF2 file");
|
||||
}
|
||||
super.close();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// implicit block
|
||||
//
|
||||
// The first four records of BCF are inline untype encoded data of:
|
||||
//
|
||||
// 4 byte integer chrom offset
|
||||
// 4 byte integer start
|
||||
// 4 byte integer ref length
|
||||
// 4 byte float qual
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
private byte[] buildSitesData( VariantContext vc ) throws IOException {
|
||||
final int contigIndex = contigDictionary.get(vc.getChr());
|
||||
if ( contigIndex == -1 )
|
||||
throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getChr()));
|
||||
|
||||
// note use of encodeRawValue to not insert the typing byte
|
||||
encoder.encodeRawValue(contigIndex, BCF2Type.INT32);
|
||||
|
||||
// pos. GATK is 1 based, BCF2 is 0 based
|
||||
encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32);
|
||||
|
||||
// ref length. GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1
|
||||
// for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1
|
||||
encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32);
|
||||
|
||||
// qual
|
||||
if ( vc.hasLog10PError() )
|
||||
encoder.encodeRawFloat((float) vc.getPhredScaledQual());
|
||||
else
|
||||
encoder.encodeRawMissingValue(BCF2Type.FLOAT);
|
||||
|
||||
// info fields
|
||||
final int nAlleles = vc.getNAlleles();
|
||||
final int nInfo = vc.getAttributes().size();
|
||||
final int nGenotypeFormatFields = getNGenotypeFormatFields(vc);
|
||||
final int nSamples = header.getNGenotypeSamples();
|
||||
|
||||
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x0000FFFF), BCF2Type.INT32);
|
||||
encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x00FFFFF), BCF2Type.INT32);
|
||||
|
||||
buildID(vc);
|
||||
buildAlleles(vc);
|
||||
buildFilter(vc);
|
||||
buildInfo(vc);
|
||||
|
||||
return encoder.getRecordBytes();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Can we safely write on the raw (undecoded) genotypes of an input VC?
|
||||
*
|
||||
* The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in
|
||||
* which case we return the previous result. If it's not cached, we use the BCF2Util to
|
||||
* compare the VC header with our header (expensive) and cache it.
|
||||
*
|
||||
* @param lazyData
|
||||
* @return
|
||||
*/
|
||||
private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
|
||||
if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
|
||||
// result is already cached
|
||||
canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
|
||||
lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
|
||||
}
|
||||
|
||||
return canPassOnUnparsedGenotypeDataForLastVCFHeader;
|
||||
}
|
||||
|
||||
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
|
||||
if ( vc.getGenotypes().isLazyWithData() ) {
|
||||
final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
|
||||
|
||||
if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
|
||||
canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
|
||||
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
|
||||
} else {
|
||||
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to get the nGenotypeFields as efficiently as possible.
|
||||
*
|
||||
* If this is a lazy BCF2 object just grab the field count from there,
|
||||
* otherwise do the whole counting by types test in the actual data
|
||||
*
|
||||
* @param vc
|
||||
* @return
|
||||
*/
|
||||
private final int getNGenotypeFormatFields(final VariantContext vc) {
|
||||
final BCF2Codec.LazyData lazyData = getLazyData(vc);
|
||||
return lazyData != null ? lazyData.nGenotypeFields : VCFWriter.calcVCFGenotypeKeys(vc, header).size();
|
||||
}
|
||||
|
||||
private void buildID( VariantContext vc ) throws IOException {
|
||||
encoder.encodeTypedString(vc.getID());
|
||||
}
|
||||
|
||||
private void buildAlleles( VariantContext vc ) throws IOException {
|
||||
for ( Allele allele : vc.getAlleles() ) {
|
||||
final byte[] s = allele.getDisplayBases();
|
||||
if ( s == null )
|
||||
throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele);
|
||||
encoder.encodeTypedString(s);
|
||||
}
|
||||
}
|
||||
|
||||
private void buildFilter( VariantContext vc ) throws IOException {
|
||||
if ( vc.isFiltered() ) {
|
||||
encodeStringsByRef(vc.getFilters());
|
||||
} else if ( vc.filtersWereApplied() ) {
|
||||
encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4));
|
||||
} else {
|
||||
encoder.encodeTypedMissing(BCF2Type.INT8);
|
||||
}
|
||||
}
|
||||
|
||||
private void buildInfo( VariantContext vc ) throws IOException {
|
||||
for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
|
||||
final String field = infoFieldEntry.getKey();
|
||||
final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field);
|
||||
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO");
|
||||
writer.start(encoder, vc);
|
||||
writer.site(encoder, vc);
|
||||
writer.done(encoder, vc);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
|
||||
final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects
|
||||
if ( lazyData != null ) {
|
||||
// we never decoded any data from this BCF file, so just pass it back
|
||||
return lazyData.bytes;
|
||||
}
|
||||
|
||||
// we have to do work to convert the VC into a BCF2 byte stream
|
||||
final List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
|
||||
for ( final String field : genotypeFields ) {
|
||||
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
|
||||
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT");
|
||||
|
||||
assert writer != null;
|
||||
|
||||
writer.start(encoder, vc);
|
||||
for ( final String name : sampleNames ) {
|
||||
Genotype g = vc.getGenotype(name);
|
||||
if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype);
|
||||
writer.addGenotype(encoder, vc, g);
|
||||
}
|
||||
writer.done(encoder, vc);
|
||||
}
|
||||
return encoder.getRecordBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file
|
||||
* but there's no header line for it.
|
||||
*
|
||||
* @param vc
|
||||
* @param field
|
||||
* @param fieldType
|
||||
*/
|
||||
private final void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) {
|
||||
throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " +
|
||||
vc.getChr() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Low-level block encoding
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Write the data in the encoder to the outputstream as a length encoded
|
||||
* block of data. After this call the encoder stream will be ready to
|
||||
* start a new data block
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
@Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"})
|
||||
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
|
||||
BCF2Type.INT32.write(infoBlock.length, outputStream);
|
||||
BCF2Type.INT32.write(genotypesBlock.length, outputStream);
|
||||
outputStream.write(infoBlock);
|
||||
outputStream.write(genotypesBlock);
|
||||
}
|
||||
|
||||
@Requires("! strings.isEmpty()")
|
||||
@Ensures("result.isIntegerType()")
|
||||
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
|
||||
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
|
||||
|
||||
// iterate over strings until we find one that needs 16 bits, and break
|
||||
for ( final String string : strings ) {
|
||||
final Integer got = stringDictionaryMap.get(string);
|
||||
if ( got == null ) throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF");
|
||||
final int offset = got;
|
||||
offsets.add(offset);
|
||||
}
|
||||
|
||||
final BCF2Type type = BCF2Utils.determineIntegerType(offsets);
|
||||
encoder.encodeTyped(offsets, type);
|
||||
return type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the contigDictionary from the contigLines extracted from the VCF header
|
||||
*
|
||||
* @param contigLines
|
||||
*/
|
||||
@Requires("contigDictionary.isEmpty()")
|
||||
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
|
||||
int offset = 0;
|
||||
for ( VCFContigHeaderLine contig : contigLines )
|
||||
contigDictionary.put(contig.getID(), offset++);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,181 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* this class writes VCF files
|
||||
*/
|
||||
/**
 * Base class for VariantContextWriters that can optionally build a Tribble
 * index on the fly while writing records.
 *
 * When on-the-fly indexing is enabled, the raw output stream is wrapped in a
 * PositionalOutputStream so the current byte offset is known, and each added
 * record is registered with a DynamicIndexCreator at that offset. The index is
 * finalized and written next to the output file on close().
 */
abstract class IndexingVariantContextWriter implements VariantContextWriter {
    // display name for error messages (file path or stream toString)
    private final String name;
    // reference sequence dictionary recorded into the index on close
    private final SAMSequenceDictionary refDict;

    // the stream records are written to; replaced by positionalOutputStream when indexing
    private OutputStream outputStream;
    // non-null only when on-the-fly indexing was successfully set up
    private PositionalOutputStream positionalOutputStream = null;
    private DynamicIndexCreator indexer = null;
    private LittleEndianOutputStream idxStream = null;

    @Requires({"name != null",
            "! ( location == null && output == null )",
            "! ( enableOnTheFlyIndexing && location == null )"})
    protected IndexingVariantContextWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
        outputStream = output;
        this.name = name;
        this.refDict = refDict;

        if ( enableOnTheFlyIndexing ) {
            // best-effort: indexing failures are swallowed and writing continues unindexed
            try {
                idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
                //System.out.println("Creating index on the fly for " + location);
                indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
                indexer.initialize(location, indexer.defaultBinSize());
                positionalOutputStream = new PositionalOutputStream(output);
                // from here on, all writes go through the position-tracking wrapper
                outputStream = positionalOutputStream;
            } catch ( IOException ex ) {
                // No matter what we keep going, since we don't care if we can't create the index file
                idxStream = null;
                indexer = null;
                positionalOutputStream = null;
            }
        }
    }

    /** @return the stream subclasses should write their records to (never null) */
    @Ensures("result != null")
    public OutputStream getOutputStream() {
        return outputStream;
    }

    /** @return a human-readable name for this writer, for error messages (never null) */
    @Ensures("result != null")
    public String getStreamName() {
        return name;
    }

    public abstract void writeHeader(VCFHeader header);

    /**
     * attempt to close the VCF file; finalizes and writes the Tribble index first
     * when on-the-fly indexing is enabled, then closes the underlying stream
     */
    public void close() {
        try {
            // try to close the index stream (keep it separate to help debugging efforts)
            if ( indexer != null ) {
                // the final stream position marks the end of the indexed region
                Index index = indexer.finalizeIndex(positionalOutputStream.getPosition());
                setIndexSequenceDictionary(index, refDict);
                index.write(idxStream);
                idxStream.close();
            }

            // close the underlying output stream as well
            outputStream.close();
        } catch (IOException e) {
            throw new RuntimeException("Unable to close index for " + getStreamName(), e);
        }
    }

    /**
     * @return the reference sequence dictionary used for the variant contexts being written
     */
    public SAMSequenceDictionary getRefDict() {
        return refDict;
    }

    /**
     * add a record to the file
     *
     * @param vc      the Variant Context object
     */
    public void add(VariantContext vc) {
        // if we are doing on the fly indexing, add the record ***before*** we write any bytes
        // (subclasses must call super.add(vc) before writing so the recorded offset is correct)
        if ( indexer != null )
            indexer.addFeature(vc, positionalOutputStream.getPosition());
    }

    /**
     * Returns a reasonable "name" for this writer, to display to the user if something goes wrong
     *
     * @param location the output file, may be null when writing to a bare stream
     * @param stream   the output stream, used as the name when location is null
     * @return a display name for the destination
     */
    protected static final String writerName(final File location, final OutputStream stream) {
        return location == null ? stream.toString() : location.getAbsolutePath();
    }

    // a constant we use for marking sequence dictionary entries in the Tribble index property list
    private static final String SequenceDictionaryPropertyPredicate = "DICT:";

    // record each contig name and length as "DICT:<name>" -> length properties on the index
    private static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
        for ( SAMSequenceRecord seq : dict.getSequences() ) {
            final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName();
            final String length = String.valueOf(seq.getSequenceLength());
            index.addProperty(contig,length);
        }
    }
}
|
||||
|
||||
/**
 * An OutputStream decorator that counts how many bytes have been written,
 * exposing the current byte offset via {@link #getPosition()}. Used to
 * support on-the-fly Tribble indexing of the emitted stream.
 */
final class PositionalOutputStream extends OutputStream {
    private final OutputStream out;
    private long position = 0;

    public PositionalOutputStream(final OutputStream out) {
        this.out = out;
    }

    public final void write(final byte[] bytes) throws IOException {
        write(bytes, 0, bytes.length);
    }

    public final void write(final byte[] bytes, final int startIndex, final int numBytes) throws IOException {
        position += numBytes;
        out.write(bytes, startIndex, numBytes);
    }

    public final void write(int c)  throws IOException {
        position++;
        out.write(c);
    }

    /** @return the number of bytes written through this stream so far */
    public final long getPosition() { return position; }

    /**
     * Fix: the original inherited OutputStream's no-op flush(), so flushing
     * this wrapper never flushed the wrapped stream. Delegate explicitly.
     */
    @Override
    public void flush() throws IOException {
        out.flush();
    }

    @Override
    public void close() throws IOException {
        super.close();
        out.close();
    }
}
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* A convenient way to provide a single view on the many int and int[] field values we work with,
|
||||
* for writing out the values. This class makes writing out the inline AD, GQ, PL, DP fields
|
||||
* easy and fast
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 6/12
|
||||
*/
|
||||
class IntGenotypeFieldAccessors {
|
||||
// initialized once per writer to allow parallel writers to work
|
||||
private final HashMap<String, Accessor> intGenotypeFieldEncoders = new HashMap<String, Accessor>();
|
||||
|
||||
public IntGenotypeFieldAccessors() {
|
||||
intGenotypeFieldEncoders.put(VCFConstants.DEPTH_KEY, new IntGenotypeFieldAccessors.DPAccessor());
|
||||
intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new IntGenotypeFieldAccessors.ADAccessor());
|
||||
intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_PL_KEY, new IntGenotypeFieldAccessors.PLAccessor());
|
||||
intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_QUALITY_KEY, new IntGenotypeFieldAccessors.GQAccessor());
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an accessor for field, or null if none exists
|
||||
* @param field
|
||||
* @return
|
||||
*/
|
||||
public Accessor getAccessor(final String field) {
|
||||
return intGenotypeFieldEncoders.get(field);
|
||||
}
|
||||
|
||||
public static abstract class Accessor {
|
||||
public abstract int[] getValues(final Genotype g);
|
||||
|
||||
public final int getSize(final Genotype g) {
|
||||
final int[] v = getValues(g);
|
||||
return v == null ? 0 : v.length;
|
||||
}
|
||||
}
|
||||
|
||||
private static abstract class AtomicAccessor extends Accessor {
|
||||
private final int[] singleton = new int[1];
|
||||
|
||||
@Override
|
||||
public int[] getValues(final Genotype g) {
|
||||
singleton[0] = getValue(g);
|
||||
return singleton[0] == -1 ? null : singleton;
|
||||
}
|
||||
|
||||
public abstract int getValue(final Genotype g);
|
||||
}
|
||||
|
||||
public static class GQAccessor extends AtomicAccessor {
|
||||
@Override public int getValue(final Genotype g) { return Math.min(g.getGQ(), VCFConstants.MAX_GENOTYPE_QUAL); }
|
||||
}
|
||||
|
||||
public static class DPAccessor extends AtomicAccessor {
|
||||
@Override public int getValue(final Genotype g) { return g.getDP(); }
|
||||
}
|
||||
|
||||
public static class ADAccessor extends Accessor {
|
||||
@Override public int[] getValues(final Genotype g) { return g.getAD(); }
|
||||
}
|
||||
|
||||
public static class PLAccessor extends Accessor {
|
||||
@Override public int[] getValues(final Genotype g) { return g.getPL(); }
|
||||
}
|
||||
}
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
/**
 * The set of options a VariantContextWriter can be configured with.
 *
 * @author Mark DePristo
 * @since 5/12
 */
public enum Options {
    /** Build a Tribble index while records are written. */
    INDEX_ON_THE_FLY,
    /** Emit sites only, dropping all genotype columns. */
    DO_NOT_WRITE_GENOTYPES,
    /** Do not fail when a record uses a field absent from the header. */
    ALLOW_MISSING_FIELDS_IN_HEADER,
    /** Always emit BCF output. */
    FORCE_BCF
}
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* this class writes VCF files, allowing records to be passed in unsorted (up to a certain genomic distance away)
|
||||
*/
|
||||
class SortingVariantContextWriter extends SortingVariantContextWriterBase {
|
||||
|
||||
// the maximum START distance between records that we'll cache
|
||||
private int maxCachingStartDistance;
|
||||
|
||||
/**
|
||||
* create a local-sorting VCF writer, given an inner VCF writer to write to
|
||||
*
|
||||
* @param innerWriter the VCFWriter to write to
|
||||
* @param maxCachingStartDistance the maximum start distance between records that we'll cache
|
||||
* @param takeOwnershipOfInner Should this Writer close innerWriter when it's done with it
|
||||
*/
|
||||
public SortingVariantContextWriter(VariantContextWriter innerWriter, int maxCachingStartDistance, boolean takeOwnershipOfInner) {
|
||||
super(innerWriter, takeOwnershipOfInner);
|
||||
this.maxCachingStartDistance = maxCachingStartDistance;
|
||||
}
|
||||
|
||||
public SortingVariantContextWriter(VariantContextWriter innerWriter, int maxCachingStartDistance) {
|
||||
this(innerWriter, maxCachingStartDistance, false); // by default, don't own inner
|
||||
}
|
||||
|
||||
protected void noteCurrentRecord(VariantContext vc) {
|
||||
super.noteCurrentRecord(vc); // first, check for errors
|
||||
|
||||
// then, update mostUpstreamWritableLoc:
|
||||
int mostUpstreamWritableIndex = vc.getStart() - maxCachingStartDistance;
|
||||
this.mostUpstreamWritableLoc = Math.max(BEFORE_MOST_UPSTREAM_LOC, mostUpstreamWritableIndex);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
/**
|
||||
* This class writes VCF files, allowing records to be passed in unsorted.
|
||||
* It also enforces that it is never passed records of the same chromosome with any other chromosome in between them.
|
||||
*/
|
||||
abstract class SortingVariantContextWriterBase implements VariantContextWriter {
|
||||
|
||||
// The VCFWriter to which to actually write the sorted VCF records
|
||||
private final VariantContextWriter innerWriter;
|
||||
|
||||
// the current queue of un-emitted records
|
||||
private final Queue<VCFRecord> queue;
|
||||
|
||||
// The locus until which we are permitted to write out (inclusive)
|
||||
protected Integer mostUpstreamWritableLoc;
|
||||
protected static final int BEFORE_MOST_UPSTREAM_LOC = 0; // No real locus index is <= 0
|
||||
|
||||
// The set of chromosomes already passed over and to which it is forbidden to return
|
||||
private final Set<String> finishedChromosomes;
|
||||
|
||||
// Should we call innerWriter.close() in close()
|
||||
private final boolean takeOwnershipOfInner;
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* create a local-sorting VCF writer, given an inner VCF writer to write to
|
||||
*
|
||||
* @param innerWriter the VCFWriter to write to
|
||||
* @param takeOwnershipOfInner Should this Writer close innerWriter when it's done with it
|
||||
*/
|
||||
public SortingVariantContextWriterBase(VariantContextWriter innerWriter, boolean takeOwnershipOfInner) {
|
||||
this.innerWriter = innerWriter;
|
||||
this.finishedChromosomes = new TreeSet<String>();
|
||||
this.takeOwnershipOfInner = takeOwnershipOfInner;
|
||||
|
||||
// has to be PriorityBlockingQueue to be thread-safe
|
||||
this.queue = new PriorityBlockingQueue<VCFRecord>(50, new VariantContextComparator());
|
||||
|
||||
this.mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC;
|
||||
}
|
||||
|
||||
public SortingVariantContextWriterBase(VariantContextWriter innerWriter) {
|
||||
this(innerWriter, false); // by default, don't own inner
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// public interface functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
innerWriter.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* attempt to close the VCF file; we need to flush the queue first
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
stopWaitingToSort();
|
||||
|
||||
if (takeOwnershipOfInner)
|
||||
innerWriter.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* add a record to the file
|
||||
*
|
||||
* @param vc the Variant Context object
|
||||
*/
|
||||
@Override
|
||||
public synchronized void add(VariantContext vc) {
|
||||
/* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100)
|
||||
since there is no implicit ordering of chromosomes:
|
||||
*/
|
||||
VCFRecord firstRec = queue.peek();
|
||||
if (firstRec != null && !vc.getChr().equals(firstRec.vc.getChr())) { // if we hit a new contig, flush the queue
|
||||
if (finishedChromosomes.contains(vc.getChr()))
|
||||
throw new IllegalArgumentException("Added a record at " + vc.getChr() + ":" + vc.getStart() + ", but already finished with chromosome" + vc.getChr());
|
||||
|
||||
finishedChromosomes.add(firstRec.vc.getChr());
|
||||
stopWaitingToSort();
|
||||
}
|
||||
|
||||
noteCurrentRecord(vc); // possibly overwritten
|
||||
|
||||
queue.add(new VCFRecord(vc));
|
||||
emitSafeRecords();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a string representation of this object.
|
||||
* @return a string representation of this object
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getName();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// protected interface functions for subclasses to use
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private synchronized void stopWaitingToSort() {
|
||||
emitRecords(true);
|
||||
mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC;
|
||||
}
|
||||
|
||||
protected synchronized void emitSafeRecords() {
|
||||
emitRecords(false);
|
||||
}
|
||||
|
||||
protected void noteCurrentRecord(VariantContext vc) {
|
||||
// did the user break the contract by giving a record too late?
|
||||
if (mostUpstreamWritableLoc != null && vc.getStart() < mostUpstreamWritableLoc) // went too far back, since may have already written anything that is <= mostUpstreamWritableLoc
|
||||
throw new IllegalArgumentException("Permitted to write any record upstream of position " + mostUpstreamWritableLoc + ", but a record at " + vc.getChr() + ":" + vc.getStart() + " was just added.");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// private implementation functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private synchronized void emitRecords(boolean emitUnsafe) {
|
||||
while (!queue.isEmpty()) {
|
||||
VCFRecord firstRec = queue.peek();
|
||||
|
||||
// No need to wait, waiting for nothing, or before what we're waiting for:
|
||||
if (emitUnsafe || mostUpstreamWritableLoc == null || firstRec.vc.getStart() <= mostUpstreamWritableLoc) {
|
||||
queue.poll();
|
||||
innerWriter.add(firstRec.vc);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class VariantContextComparator implements Comparator<VCFRecord> {
|
||||
public int compare(VCFRecord r1, VCFRecord r2) {
|
||||
return r1.vc.getStart() - r2.vc.getStart();
|
||||
}
|
||||
}
|
||||
|
||||
private static class VCFRecord {
|
||||
public VariantContext vc;
|
||||
|
||||
public VCFRecord(VariantContext vc) {
|
||||
this.vc = vc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,606 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.reflect.Array;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* this class writes VCF files
|
||||
*/
|
||||
class VCFWriter extends IndexingVariantContextWriter {
|
||||
private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString();
|
||||
|
||||
// should we write genotypes or just sites?
|
||||
final protected boolean doNotWriteGenotypes;
|
||||
|
||||
// the VCF header we're storing
|
||||
protected VCFHeader mHeader = null;
|
||||
|
||||
final private boolean allowMissingFieldsInHeader;
|
||||
|
||||
/**
|
||||
* The VCF writer uses an internal Writer, based by the ByteArrayOutputStream lineBuffer,
|
||||
* to temp. buffer the header and per-site output before flushing the per line output
|
||||
* in one go to the super.getOutputStream. This results in high-performance, proper encoding,
|
||||
* and allows us to avoid flushing explicitly the output stream getOutputStream, which
|
||||
* allows us to properly compress vcfs in gz format without breaking indexing on the fly
|
||||
* for uncompressed streams.
|
||||
*/
|
||||
private static final int INITIAL_BUFFER_SIZE = 1024 * 16;
|
||||
private final ByteArrayOutputStream lineBuffer = new ByteArrayOutputStream(INITIAL_BUFFER_SIZE);
|
||||
private final Writer writer;
|
||||
|
||||
/**
|
||||
* The encoding used for VCF files. ISO-8859-1
|
||||
*/
|
||||
final private Charset charset;
|
||||
|
||||
private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
|
||||
|
||||
/**
 * Creates a VCF writer over the given file/stream pair.
 *
 * @param location                  the output file, or null when writing to a bare stream
 * @param output                    the output stream to write to
 * @param refDict                   reference dictionary used by the on-the-fly indexer
 * @param enableOnTheFlyIndexing    whether to build a Tribble index while writing
 * @param doNotWriteGenotypes       whether to emit sites only (no genotype columns)
 * @param allowMissingFieldsInHeader whether records may use fields absent from the header
 */
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
                 final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes,
                 final boolean allowMissingFieldsInHeader ) {
    super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
    this.charset = Charset.forName("ISO-8859-1");
    this.writer = new OutputStreamWriter(lineBuffer, this.charset);
    this.doNotWriteGenotypes = doNotWriteGenotypes;
    this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// VCFWriter interface functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Write String s to the internal buffered writer.
|
||||
*
|
||||
* flushBuffer() must be called to actually write the data to the true output stream.
|
||||
*
|
||||
* @param s the string to write
|
||||
* @throws IOException
|
||||
*/
|
||||
private void write(final String s) throws IOException {
|
||||
writer.write(s);
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually write the line buffer contents to the destination output stream.
|
||||
*
|
||||
* After calling this function the line buffer is reset, so the contents of the buffer can be reused
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private void flushBuffer() throws IOException {
|
||||
writer.flush();
|
||||
getOutputStream().write(lineBuffer.toByteArray());
|
||||
lineBuffer.reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeHeader(VCFHeader header) {
|
||||
// note we need to update the mHeader object after this call because they header
|
||||
// may have genotypes trimmed out of it, if doNotWriteGenotypes is true
|
||||
try {
|
||||
mHeader = writeHeader(header, writer, doNotWriteGenotypes, getVersionLine(), getStreamName());
|
||||
flushBuffer();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException("Couldn't write file " + getStreamName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String getVersionLine() {
|
||||
return VERSION_LINE;
|
||||
}
|
||||
|
||||
public static VCFHeader writeHeader(VCFHeader header,
|
||||
final Writer writer,
|
||||
final boolean doNotWriteGenotypes,
|
||||
final String versionLine,
|
||||
final String streamNameForError) {
|
||||
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
|
||||
|
||||
try {
|
||||
// the file format field needs to be written first
|
||||
writer.write(versionLine + "\n");
|
||||
|
||||
for ( VCFHeaderLine line : header.getMetaDataInSortedOrder() ) {
|
||||
if ( VCFHeaderVersion.isFormatString(line.getKey()) )
|
||||
continue;
|
||||
|
||||
writer.write(VCFHeader.METADATA_INDICATOR);
|
||||
writer.write(line.toString());
|
||||
writer.write("\n");
|
||||
}
|
||||
|
||||
// write out the column line
|
||||
writer.write(VCFHeader.HEADER_INDICATOR);
|
||||
boolean isFirst = true;
|
||||
for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
|
||||
if ( isFirst )
|
||||
isFirst = false; // don't write out a field separator
|
||||
else
|
||||
writer.write(VCFConstants.FIELD_SEPARATOR);
|
||||
writer.write(field.toString());
|
||||
}
|
||||
|
||||
if ( header.hasGenotypingData() ) {
|
||||
writer.write(VCFConstants.FIELD_SEPARATOR);
|
||||
writer.write("FORMAT");
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
writer.write(VCFConstants.FIELD_SEPARATOR);
|
||||
writer.write(sample);
|
||||
}
|
||||
}
|
||||
|
||||
writer.write("\n");
|
||||
writer.flush(); // necessary so that writing to an output stream will work
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("IOException writing the VCF header to " + streamNameForError, e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* attempt to close the VCF file
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
// try to close the vcf stream
|
||||
try {
|
||||
// TODO -- would it be useful to null out the line buffer so we don't have it around unnecessarily?
|
||||
writer.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close " + getStreamName(), e);
|
||||
}
|
||||
|
||||
super.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* add a record to the file
|
||||
*
|
||||
* @param vc the Variant Context object
|
||||
*/
|
||||
@Override
|
||||
public void add(VariantContext vc) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
|
||||
|
||||
if ( doNotWriteGenotypes )
|
||||
vc = new VariantContextBuilder(vc).noGenotypes().make();
|
||||
|
||||
try {
|
||||
super.add(vc);
|
||||
|
||||
Map<Allele, String> alleleMap = buildAlleleMap(vc);
|
||||
|
||||
// CHROM
|
||||
write(vc.getChr());
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// POS
|
||||
write(String.valueOf(vc.getStart()));
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// ID
|
||||
String ID = vc.getID();
|
||||
write(ID);
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// REF
|
||||
String refString = vc.getReference().getDisplayString();
|
||||
write(refString);
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// ALT
|
||||
if ( vc.isVariant() ) {
|
||||
Allele altAllele = vc.getAlternateAllele(0);
|
||||
String alt = altAllele.getDisplayString();
|
||||
write(alt);
|
||||
|
||||
for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
|
||||
altAllele = vc.getAlternateAllele(i);
|
||||
alt = altAllele.getDisplayString();
|
||||
write(",");
|
||||
write(alt);
|
||||
}
|
||||
} else {
|
||||
write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
|
||||
}
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// QUAL
|
||||
if ( !vc.hasLog10PError() )
|
||||
write(VCFConstants.MISSING_VALUE_v4);
|
||||
else
|
||||
write(formatQualValue(vc.getPhredScaledQual()));
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// FILTER
|
||||
String filters = getFilterString(vc);
|
||||
write(filters);
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
|
||||
// INFO
|
||||
Map<String, String> infoFields = new TreeMap<String, String>();
|
||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
||||
String key = field.getKey();
|
||||
|
||||
if ( ! mHeader.hasInfoLine(key) )
|
||||
fieldIsMissingFromHeaderError(vc, key, "INFO");
|
||||
|
||||
String outputValue = formatVCFField(field.getValue());
|
||||
if ( outputValue != null )
|
||||
infoFields.put(key, outputValue);
|
||||
}
|
||||
writeInfoString(infoFields);
|
||||
|
||||
// FORMAT
|
||||
final GenotypesContext gc = vc.getGenotypes();
|
||||
if ( gc.isLazyWithData() && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() instanceof String ) {
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
write(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString());
|
||||
} else {
|
||||
List<String> genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader);
|
||||
if ( ! genotypeAttributeKeys.isEmpty() ) {
|
||||
for ( final String format : genotypeAttributeKeys )
|
||||
if ( ! mHeader.hasFormatLine(format) )
|
||||
fieldIsMissingFromHeaderError(vc, format, "FORMAT");
|
||||
|
||||
final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
|
||||
|
||||
write(VCFConstants.FIELD_SEPARATOR);
|
||||
write(genotypeFormatString);
|
||||
|
||||
addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
|
||||
}
|
||||
}
|
||||
|
||||
write("\n");
|
||||
// note that we cannot call flush here if we want block gzipping to work properly
|
||||
// calling flush results in all gzipped blocks for each variant
|
||||
flushBuffer();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
|
||||
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
|
||||
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
|
||||
|
||||
final List<Allele> alleles = vc.getAlleles();
|
||||
for ( int i = 0; i < alleles.size(); i++ ) {
|
||||
alleleMap.put(alleles.get(i), String.valueOf(i));
|
||||
}
|
||||
|
||||
return alleleMap;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// implementation functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private final String getFilterString(final VariantContext vc) {
|
||||
if ( vc.isFiltered() ) {
|
||||
for ( final String filter : vc.getFilters() )
|
||||
if ( ! mHeader.hasFilterLine(filter) )
|
||||
fieldIsMissingFromHeaderError(vc, filter, "FILTER");
|
||||
|
||||
return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()));
|
||||
}
|
||||
else if ( vc.filtersWereApplied() )
|
||||
return VCFConstants.PASSES_FILTERS_v4;
|
||||
else
|
||||
return VCFConstants.UNFILTERED;
|
||||
}
|
||||
|
||||
private static final String QUAL_FORMAT_STRING = "%.2f";
|
||||
private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00";
|
||||
|
||||
private String formatQualValue(double qual) {
|
||||
String s = String.format(QUAL_FORMAT_STRING, qual);
|
||||
if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) )
|
||||
s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length());
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the info string; assumes that no values are null
|
||||
*
|
||||
* @param infoFields a map of info fields
|
||||
* @throws IOException for writer
|
||||
*/
|
||||
private void writeInfoString(Map<String, String> infoFields) throws IOException {
|
||||
if ( infoFields.isEmpty() ) {
|
||||
write(VCFConstants.EMPTY_INFO_FIELD);
|
||||
return;
|
||||
}
|
||||
|
||||
boolean isFirst = true;
|
||||
for ( Map.Entry<String, String> entry : infoFields.entrySet() ) {
|
||||
if ( isFirst )
|
||||
isFirst = false;
|
||||
else
|
||||
write(VCFConstants.INFO_FIELD_SEPARATOR);
|
||||
|
||||
String key = entry.getKey();
|
||||
write(key);
|
||||
|
||||
if ( !entry.getValue().equals("") ) {
|
||||
VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key);
|
||||
if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) {
|
||||
write("=");
|
||||
write(entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * add the genotype data: for each sample in the header, emit one tab-separated
 * genotype column built from the keys in genotypeFormatKeys (in order).
 *
 * Trailing all-missing values are stripped from each sample's column before writing.
 *
 * @param vc the variant context
 * @param genotypeFormatKeys Genotype formatting string (the FORMAT keys, in output order)
 * @param alleleMap alleles for this context, mapped to their VCF index strings
 * @throws IOException for writer
 */
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
    throws IOException {
    final int ploidy = vc.getMaxPloidy(2);

    for ( String sample : mHeader.getGenotypeSamples() ) {
        write(VCFConstants.FIELD_SEPARATOR);

        // samples absent from this record get a synthetic missing genotype of the site's max ploidy
        Genotype g = vc.getGenotype(sample);
        if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy);

        final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
        for ( String field : genotypeFormatKeys ) {
            if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
                // GT is written directly (allele indices joined by | or /), not collected into attrs
                if ( !g.isAvailable() ) {
                    throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record");
                }

                writeAllele(g.getAllele(0), alleleMap);
                for (int i = 1; i < g.getPloidy(); i++) {
                    write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                    writeAllele(g.getAllele(i), alleleMap);
                }

                continue;
            } else {
                String outputValue;
                if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) {
                    outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4;
                } else {
                    // fast path for the standard int-valued fields (DP/AD/PL/GQ-style accessors)
                    final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);
                    if ( accessor != null ) {
                        final int[] intValues = accessor.getValues(g);
                        if ( intValues == null )
                            outputValue = VCFConstants.MISSING_VALUE_v4;
                        else if ( intValues.length == 1 ) // fast path
                            outputValue = Integer.toString(intValues[0]);
                        else {
                            StringBuilder sb = new StringBuilder();
                            sb.append(intValues[0]);
                            for ( int i = 1; i < intValues.length; i++) {
                                sb.append(",");
                                sb.append(intValues[i]);
                            }
                            outputValue = sb.toString();
                        }
                    } else {
                        // generic path: pull the extended attribute, or MISSING if absent
                        Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;

                        VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
                        if ( metaData != null ) {
                            int numInFormatField = metaData.getCount(vc);
                            if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
                                // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                                // For example, if Number=2, the string has to be ".,."
                                StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                                for ( int i = 1; i < numInFormatField; i++ ) {
                                    sb.append(",");
                                    sb.append(VCFConstants.MISSING_VALUE_v4);
                                }
                                val = sb.toString();
                            }
                        }

                        // assume that if key is absent, then the given string encoding suffices
                        outputValue = formatVCFField(val);
                    }
                }

                // a null outputValue (e.g. a false Boolean from formatVCFField) is simply omitted
                if ( outputValue != null )
                    attrs.add(outputValue);
            }
        }

        // strip off trailing missing values
        for (int i = attrs.size()-1; i >= 0; i--) {
            if ( isMissingValue(attrs.get(i)) )
                attrs.remove(i);
            else
                break;
        }

        // join remaining values with ':'; the leading separator is skipped only when
        // GT was not in the FORMAT keys (GT itself was already written above)
        for (int i = 0; i < attrs.size(); i++) {
            if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) )
                write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
            write(attrs.get(i));
        }
    }
}
|
||||
|
||||
private boolean isMissingValue(String s) {
|
||||
// we need to deal with the case that it's a list of missing values
|
||||
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());
|
||||
}
|
||||
|
||||
private void writeAllele(Allele allele, Map<Allele, String> alleleMap) throws IOException {
|
||||
String encoding = alleleMap.get(allele);
|
||||
if ( encoding == null )
|
||||
throw new TribbleException.InternalCodecException("Allele " + allele + " is not an allele in the variant context");
|
||||
write(encoding);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a double value and pretty prints it to a String for display
|
||||
*
|
||||
* Large doubles => gets %.2f style formatting
|
||||
* Doubles < 1 / 10 but > 1/100 </>=> get %.3f style formatting
|
||||
* Double < 1/100 => %.3e formatting
|
||||
* @param d
|
||||
* @return
|
||||
*/
|
||||
public static final String formatVCFDouble(final double d) {
|
||||
String format;
|
||||
if ( d < 1 ) {
|
||||
if ( d < 0.01 ) {
|
||||
if ( Math.abs(d) >= 1e-20 )
|
||||
format = "%.3e";
|
||||
else {
|
||||
// return a zero format
|
||||
return "0.00";
|
||||
}
|
||||
} else {
|
||||
format = "%.3f";
|
||||
}
|
||||
} else {
|
||||
format = "%.2f";
|
||||
}
|
||||
|
||||
return String.format(format, d);
|
||||
}
|
||||
|
||||
public static String formatVCFField(Object val) {
|
||||
String result;
|
||||
if ( val == null )
|
||||
result = VCFConstants.MISSING_VALUE_v4;
|
||||
else if ( val instanceof Double )
|
||||
result = formatVCFDouble((Double) val);
|
||||
else if ( val instanceof Boolean )
|
||||
result = (Boolean)val ? "" : null; // empty string for true, null for false
|
||||
else if ( val instanceof List ) {
|
||||
result = formatVCFField(((List)val).toArray());
|
||||
} else if ( val.getClass().isArray() ) {
|
||||
final int length = Array.getLength(val);
|
||||
if ( length == 0 )
|
||||
return formatVCFField(null);
|
||||
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
|
||||
for ( int i = 1; i < length; i++) {
|
||||
sb.append(",");
|
||||
sb.append(formatVCFField(Array.get(val, i)));
|
||||
}
|
||||
result = sb.toString();
|
||||
} else
|
||||
result = val.toString();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine which genotype fields are in use in the genotypes in VC
|
||||
* @param vc
|
||||
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
|
||||
*/
|
||||
public static List<String> calcVCFGenotypeKeys(final VariantContext vc, final VCFHeader header) {
|
||||
Set<String> keys = new HashSet<String>();
|
||||
|
||||
boolean sawGoodGT = false;
|
||||
boolean sawGoodQual = false;
|
||||
boolean sawGenotypeFilter = false;
|
||||
boolean sawDP = false;
|
||||
boolean sawAD = false;
|
||||
boolean sawPL = false;
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
keys.addAll(g.getExtendedAttributes().keySet());
|
||||
if ( g.isAvailable() ) sawGoodGT = true;
|
||||
if ( g.hasGQ() ) sawGoodQual = true;
|
||||
if ( g.hasDP() ) sawDP = true;
|
||||
if ( g.hasAD() ) sawAD = true;
|
||||
if ( g.hasPL() ) sawPL = true;
|
||||
if (g.isFiltered()) sawGenotypeFilter = true;
|
||||
}
|
||||
|
||||
if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
|
||||
if ( sawDP ) keys.add(VCFConstants.DEPTH_KEY);
|
||||
if ( sawAD ) keys.add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
|
||||
if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY);
|
||||
if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
|
||||
|
||||
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
|
||||
|
||||
// make sure the GT is first
|
||||
if ( sawGoodGT ) {
|
||||
List<String> newList = new ArrayList<String>(sortedList.size()+1);
|
||||
newList.add(VCFConstants.GENOTYPE_KEY);
|
||||
newList.addAll(sortedList);
|
||||
sortedList = newList;
|
||||
}
|
||||
|
||||
if ( sortedList.isEmpty() && header.hasGenotypingData() ) {
|
||||
// this needs to be done in case all samples are no-calls
|
||||
return Collections.singletonList(VCFConstants.GENOTYPE_KEY);
|
||||
} else {
|
||||
return sortedList;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static int countOccurrences(char c, String s) {
|
||||
int count = 0;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
count += s.charAt(i) == c ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) {
|
||||
if ( !allowMissingFieldsInHeader)
|
||||
throw new IllegalStateException("Key " + id + " found in VariantContext field " + field
|
||||
+ " at " + vc.getChr() + ":" + vc.getStart()
|
||||
+ " but this key isn't defined in the VCFHeader. We require all VCFs to have"
|
||||
+ " complete VCF headers by default.");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
/**
 * this class writes VCF files (implementations may also target other
 * encodings; see the writer factory)
 */
public interface VariantContextWriter {

    // writes the header; presumably called once before any add() — confirm with implementations
    public void writeHeader(VCFHeader header);

    /**
     * attempt to close the VCF file
     */
    public void close();

    // write a single variant record
    public void add(VariantContext vc);
}
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* Factory methods to create VariantContext writers
|
||||
*
|
||||
* @author depristo
|
||||
* @since 5/12
|
||||
*/
|
||||
public class VariantContextWriterFactory {
|
||||
|
||||
public static final EnumSet<Options> DEFAULT_OPTIONS = EnumSet.of(Options.INDEX_ON_THE_FLY);
|
||||
public static final EnumSet<Options> NO_OPTIONS = EnumSet.noneOf(Options.class);
|
||||
|
||||
private VariantContextWriterFactory() {}
|
||||
|
||||
public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict) {
|
||||
return create(location, openOutputStream(location), refDict, DEFAULT_OPTIONS);
|
||||
}
|
||||
|
||||
public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict, final EnumSet<Options> options) {
|
||||
return create(location, openOutputStream(location), refDict, options);
|
||||
}
|
||||
|
||||
public static VariantContextWriter create(final File location,
|
||||
final OutputStream output,
|
||||
final SAMSequenceDictionary refDict) {
|
||||
return create(location, output, refDict, DEFAULT_OPTIONS);
|
||||
}
|
||||
|
||||
public static VariantContextWriter create(final OutputStream output,
|
||||
final SAMSequenceDictionary refDict,
|
||||
final EnumSet<Options> options) {
|
||||
return create(null, output, refDict, options);
|
||||
}
|
||||
|
||||
public static VariantContextWriter create(final File location,
|
||||
final OutputStream output,
|
||||
final SAMSequenceDictionary refDict,
|
||||
final EnumSet<Options> options) {
|
||||
final boolean enableBCF = isBCFOutput(location, options);
|
||||
|
||||
if ( enableBCF )
|
||||
return new BCF2Writer(location, output, refDict,
|
||||
options.contains(Options.INDEX_ON_THE_FLY),
|
||||
options.contains(Options.DO_NOT_WRITE_GENOTYPES));
|
||||
else {
|
||||
return new VCFWriter(location, output, refDict,
|
||||
options.contains(Options.INDEX_ON_THE_FLY),
|
||||
options.contains(Options.DO_NOT_WRITE_GENOTYPES),
|
||||
options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we output a BCF file based solely on the name of the file at location?
|
||||
*
|
||||
* @param location
|
||||
* @return
|
||||
*/
|
||||
public static boolean isBCFOutput(final File location) {
|
||||
return isBCFOutput(location, EnumSet.noneOf(Options.class));
|
||||
}
|
||||
|
||||
public static boolean isBCFOutput(final File location, final EnumSet<Options> options) {
|
||||
return options.contains(Options.FORCE_BCF) || (location != null && location.getName().contains(".bcf"));
|
||||
}
|
||||
|
||||
public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance) {
|
||||
return sortOnTheFly(innerWriter, maxCachingStartDistance, false);
|
||||
}
|
||||
|
||||
public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance, boolean takeOwnershipOfInner) {
|
||||
return new SortingVariantContextWriter(innerWriter, maxCachingStartDistance, takeOwnershipOfInner);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a output stream writing to location, or throws an exception if this fails
|
||||
* @param location
|
||||
* @return
|
||||
*/
|
||||
protected static OutputStream openOutputStream(final File location) {
|
||||
try {
|
||||
return new FileOutputStream(location);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new RuntimeException(location + ": Unable to create VCF writer", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,724 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.AsciiFeatureCodec;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.NameAwareCodec;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
|
||||
public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec {
|
||||
// allele-size threshold (2^20 bases) above which a warning is presumably emitted — confirm in parseAlleles
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20);

protected final static int NUM_STANDARD_FIELDS = 8;  // INFO is the 8th column

// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
protected VCFHeaderVersion version = null;

// a mapping of the allele string to its parsed alleles — presumably a per-record parse cache; confirm in parseAlleles
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);

// scratch buffers for ParsingUtils.split, reused across records to avoid reallocation
protected String[] GTValueArray = new String[100];
protected String[] genotypeKeyArray = new String[100];
protected String[] infoFieldArray = new String[1000];
protected String[] infoValueArray = new String[1000];

// for performance testing purposes
public static boolean validate = true;

// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
// todo: make this thread safe?
protected String[] parts = null;
protected String[] genotypeParts = null;
protected final String[] locParts = new String[6];

// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,List<String>> filterHash = new HashMap<String,List<String>>();

// we store a name to give to each of the variant contexts we emit
protected String name = "Unknown";

// line counter used in error messages; may drift out of sync (see TODO in parseVCFLine)
protected int lineNo = 0;

// interning cache for frequently repeated strings (see getCachedString)
protected Map<String, String> stringCache = new HashMap<String, String>();

// ensures the "INFO key without '=' on a non-Flag field" warning is printed at most once
protected boolean warnedAboutNoEqualsForNonFlag = false;

/**
 * If true, then we'll magically fix up VCF headers on the fly when we read them in
 */
protected boolean doOnTheFlyModifications = true;
|
||||
|
||||
/** Registers VariantContext as the feature type produced by this codec. */
protected AbstractVCFCodec() {
    super(VariantContext.class);
}
|
||||
|
||||
/**
|
||||
* Creates a LazyParser for a LazyGenotypesContext to use to decode
|
||||
* our genotypes only when necessary. We do this instead of eagarly
|
||||
* decoding the genotypes just to turn around and reencode in the frequent
|
||||
* case where we don't actually want to manipulate the genotypes
|
||||
*/
|
||||
class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser {
|
||||
final List<Allele> alleles;
|
||||
final String contig;
|
||||
final int start;
|
||||
|
||||
LazyVCFGenotypesParser(final List<Allele> alleles, final String contig, final int start) {
|
||||
this.alleles = alleles;
|
||||
this.contig = contig;
|
||||
this.start = start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LazyGenotypesContext.LazyData parse(final Object data) {
|
||||
//System.out.printf("Loading genotypes... %s:%d%n", contig, start);
|
||||
return createGenotypeMap((String) data, alleles, contig, start);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Read and parse the header lines from the given reader.
 *
 * @param reader the line reader to take header lines from
 * @return the parsed header object (implementation-defined; NOTE(review): the
 *         original doc claimed "the number of header lines", which does not
 *         match the Object return type — confirm against concrete codecs)
 */
public abstract Object readHeader(LineReader reader);
|
||||
|
||||
/**
 * parse the filter string, first checking to see if we already have parsed it in a previous attempt
 * @param filterString the string to parse
 * @return a set of the filters applied, or null when no filters should be set
 *         (callers skip builder.filters(...) on null; see parseVCFLine)
 */
protected abstract List<String> parseFilters(String filterString);
|
||||
|
||||
/**
 * create a VCF header from a set of header record lines
 *
 * @param headerStrings a list of strings that represent all the ## and # entries
 * @param version the declared VCF version; stored and passed to typed header-line parsers
 * @return a VCFHeader object
 */
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
    this.version = version;

    Set<VCFHeaderLine> metaData = new LinkedHashSet<VCFHeaderLine>();
    Set<String> sampleNames = new LinkedHashSet<String>();
    int contigCounter = 0;
    // iterate over all the passed in strings
    for ( String str : headerStrings ) {
        if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
            // the single "#CHROM POS ID ..." column line: validate the fixed columns, then collect samples
            String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR);
            if ( strings.length < VCFHeader.HEADER_FIELDS.values().length )
                throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str);

            // each fixed column must appear in declaration order
            int arrayIndex = 0;
            for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
                try {
                    if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
                        throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'");
                } catch (IllegalArgumentException e) {
                    // valueOf failed: the column name is not a known header field at all
                    throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name.");
                }
                arrayIndex++;
            }

            boolean sawFormatTag = false;
            if ( arrayIndex < strings.length ) {
                if ( !strings[arrayIndex].equals("FORMAT") )
                    throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
                sawFormatTag = true;
                arrayIndex++;
            }

            // everything after FORMAT is a sample name
            while ( arrayIndex < strings.length )
                sampleNames.add(strings[arrayIndex++]);

            if ( sawFormatTag && sampleNames.size() == 0 )
                throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data");

        } else {
            // a ## metadata line: dispatch on the prefix; substring offsets skip "##INFO=", "##FILTER=", etc.
            if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) {
                final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version);
                metaData.add(info);
            } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) {
                final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version);
                metaData.add(filter);
            } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) {
                final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version);
                metaData.add(format);
            } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) {
                // contigCounter records the declaration order of ##contig lines
                final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++);
                metaData.add(contig);
            } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
                final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"));
                metaData.add(alt);
            } else {
                // unstructured ##key=value line
                int equals = str.indexOf("=");
                if ( equals != -1 )
                    metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
            }
        }
    }

    this.header = new VCFHeader(metaData, sampleNames);
    if ( doOnTheFlyModifications )
        this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header);
    return this.header;
}
|
||||
|
||||
/**
 * the fast decode function
 * @param line the line of text for the record
 * @return a feature, (not guaranteed complete) that has the correct start and stop
 */
public Feature decodeLoc(String line) {
    // skip genotype decoding for speed; site-level fields are still parsed
    return decodeLine(line, false);
}
|
||||
|
||||
/**
 * decode the line into a feature (VariantContext), including (lazily parsed) genotype data
 * @param line the line
 * @return a VariantContext
 */
public VariantContext decode(String line) {
    return decodeLine(line, true);
}
|
||||
|
||||
/**
 * Shared implementation of decode/decodeLoc: validates the column count and
 * hands the split fields off to parseVCFLine.
 *
 * @param line a full VCF record line (header lines yield null)
 * @param includeGenotypes whether genotype data should be attached to the result
 * @return the parsed VariantContext, or null for header lines
 */
private final VariantContext decodeLine(final String line, final boolean includeGenotypes) {
    // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;

    // our header cannot be null, we need the genotype sample names and counts
    if (header == null) throw new TribbleException("VCF Header cannot be null when decoding a record");

    // lazily size the reusable split buffer: 8 fixed columns plus at most one FORMAT+genotypes blob
    if (parts == null)
        parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];

    int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);

    // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data)
    if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
            (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
        throw new TribbleException("Line " + lineNo + ": there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
                " tokens, and saw " + nParts + " )");

    return parseVCFLine(parts, includeGenotypes);
}
|
||||
|
||||
/**
 * parse out the VCF line
 *
 * @param parts the parts split up (8 fixed columns plus, optionally, one trailing
 *              blob containing FORMAT and all genotype columns)
 * @param includeGenotypes whether to attach (lazily decoded) genotype data
 * @return a variant context object
 */
private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) {
    VariantContextBuilder builder = new VariantContextBuilder();
    builder.source(getName());

    // increment the line count
    // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when
    // TODO -- the first record is far along the contig) and the line counter can get out of sync
    lineNo++;

    // parse out the required fields
    final String chr = getCachedString(parts[0]);
    builder.chr(chr);
    int pos = -1;
    try {
        pos = Integer.valueOf(parts[1]);
    } catch (NumberFormatException e) {
        generateException(parts[1] + " is not a valid start position in the VCF format");
    }
    builder.start(pos);

    if ( parts[2].length() == 0 )
        generateException("The VCF specification requires a valid ID field");
    else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) )
        builder.noID();
    else
        builder.id(parts[2]);

    // REF and ALT are uppercased before parsing; both are interned via the string cache
    final String ref = getCachedString(parts[3].toUpperCase());
    final String alts = getCachedString(parts[4].toUpperCase());
    builder.log10PError(parseQual(parts[5]));

    // null from parseFilters means "set no filters at all" (see parseFilters contract)
    final List<String> filters = parseFilters(getCachedString(parts[6]));
    if ( filters != null ) builder.filters(new HashSet<String>(filters));
    final Map<String, Object> attrs = parseInfo(parts[7]);
    builder.attributes(attrs);

    if ( attrs.containsKey(VCFConstants.END_KEY) ) {
        // update stop with the end key if provided
        try {
            builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()));
        } catch (Exception e) {
            generateException("the END value in the INFO field is not valid");
        }
    } else {
        // default stop: start plus reference length minus one
        builder.stop(pos + ref.length() - 1);
    }

    // get our alleles, filters, and setup an attribute map
    final List<Allele> alleles = parseAlleles(ref, alts, lineNo);
    builder.alleles(alleles);

    // do we have genotyping data
    if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) {
        // genotypes are held as a raw string and decoded only on demand
        final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos);
        final int nGenotypes = header.getNGenotypeSamples();
        LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes);

        // did we resort the sample names? If so, we need to load the genotype data
        if ( !header.samplesWereAlreadySorted() )
            lazy.decode();

        builder.genotypesNoValidation(lazy);
    }

    VariantContext vc = null;
    try {
        vc = builder.make();
    } catch (Exception e) {
        // surface builder validation failures through the codec's own error path
        generateException(e.getMessage());
    }

    return vc;
}
|
||||
|
||||
/**
 * get the name of this codec (attached as the source of each emitted VariantContext)
 * @return our set name
 */
public String getName() {
    return name;
}
|
||||
|
||||
/**
 * set the name of this codec
 * @param name new name
 */
public void setName(String name) {
    this.name = name;
}
|
||||
|
||||
/**
|
||||
* Return a cached copy of the supplied string.
|
||||
*
|
||||
* @param str string
|
||||
* @return interned string
|
||||
*/
|
||||
protected String getCachedString(String str) {
|
||||
String internedString = stringCache.get(str);
|
||||
if ( internedString == null ) {
|
||||
internedString = new String(str);
|
||||
stringCache.put(internedString, internedString);
|
||||
}
|
||||
return internedString;
|
||||
}
|
||||
|
||||
/**
 * parse out the info fields
 * @param infoField the raw INFO column text
 * @return a mapping of keys to objects (String, List&lt;String&gt;, Boolean TRUE for
 *         flags, or the missing marker)
 */
private Map<String, Object> parseInfo(String infoField) {
    Map<String, Object> attributes = new HashMap<String, Object>();

    if ( infoField.length() == 0 )
        generateException("The VCF specification requires a valid info field");

    if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) {
        if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 )
            generateException("The VCF specification does not allow for whitespace in the INFO field");

        // split the column into key[=value] tokens
        int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false);
        for (int i = 0; i < infoFieldSplitSize; i++) {
            String key;
            Object value;

            int eqI = infoFieldArray[i].indexOf("=");
            if ( eqI != -1 ) {
                key = infoFieldArray[i].substring(0, eqI);
                String valueString = infoFieldArray[i].substring(eqI+1);

                // split on the INFO field separator
                int infoValueSplitSize = ParsingUtils.split(valueString, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false);
                if ( infoValueSplitSize == 1 ) {
                    value = infoValueArray[0];
                    final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key);
                    if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) {
                        // deal with the case where a flag field has =0, such as DB=0, by skipping the add
                        continue;
                    }
                } else {
                    // multi-valued entry: keep the individual strings as a list
                    ArrayList<String> valueList = new ArrayList<String>(infoValueSplitSize);
                    for ( int j = 0; j < infoValueSplitSize; j++ )
                        valueList.add(infoValueArray[j]);
                    value = valueList;
                }
            } else {
                // bare key with no '=': legal only for Flag-typed fields
                key = infoFieldArray[i];
                final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key);
                if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) {
                    // non-Flag field without a value: warn once (debug mode only) and record it as missing
                    if ( GeneralUtils.DEBUG_MODE_ENABLED && ! warnedAboutNoEqualsForNonFlag ) {
                        System.err.println("Found info key " + key + " without a = value, but the header says the field is of type "
                                + headerLine.getType() + " but this construct is only value for FLAG type fields");
                        warnedAboutNoEqualsForNonFlag = true;
                    }

                    value = VCFConstants.MISSING_VALUE_v4;
                } else {
                    value = true;
                }
            }

            // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING
            if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4;

            attributes.put(key, value);
        }
    }

    return attributes;
}
|
||||
|
||||
/**
|
||||
* create a an allele from an index and an array of alleles
|
||||
* @param index the index
|
||||
* @param alleles the alleles
|
||||
* @return an Allele
|
||||
*/
|
||||
protected static Allele oneAllele(String index, List<Allele> alleles) {
|
||||
if ( index.equals(VCFConstants.EMPTY_ALLELE) )
|
||||
return Allele.NO_CALL;
|
||||
final int i;
|
||||
try {
|
||||
i = Integer.valueOf(index);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new TribbleException.InternalCodecException("The following invalid GT allele index was encountered in the file: " + index);
|
||||
}
|
||||
if ( i >= alleles.size() )
|
||||
throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record");
|
||||
return alleles.get(i);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* parse genotype alleles from the genotype string
|
||||
* @param GT GT string
|
||||
* @param alleles list of possible alleles
|
||||
* @param cache cache of alleles for GT
|
||||
* @return the allele list for the GT string
|
||||
*/
|
||||
protected static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// cache results [since they are immutable] and return a single object for each genotype
|
||||
List<Allele> GTAlleles = cache.get(GT);
|
||||
|
||||
if ( GTAlleles == null ) {
|
||||
StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS);
|
||||
GTAlleles = new ArrayList<Allele>(st.countTokens());
|
||||
while ( st.hasMoreTokens() ) {
|
||||
String genotype = st.nextToken();
|
||||
GTAlleles.add(oneAllele(genotype, alleles));
|
||||
}
|
||||
cache.put(GT, GTAlleles);
|
||||
}
|
||||
|
||||
return GTAlleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the qual value
|
||||
* @param qualString the quality string
|
||||
* @return return a double
|
||||
*/
|
||||
protected static Double parseQual(String qualString) {
|
||||
// if we're the VCF 4 missing char, return immediately
|
||||
if ( qualString.equals(VCFConstants.MISSING_VALUE_v4))
|
||||
return VariantContext.NO_LOG10_PERROR;
|
||||
|
||||
Double val = Double.valueOf(qualString);
|
||||
|
||||
// check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles
|
||||
if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON))
|
||||
return VariantContext.NO_LOG10_PERROR;
|
||||
|
||||
// scale and return the value
|
||||
return val / -10.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the alleles
|
||||
* @param ref the reference base
|
||||
* @param alts a string of alternates to break into alleles
|
||||
* @param lineNo the line number for this record
|
||||
* @return a list of alleles, and a pair of the shortest and longest sequence
|
||||
*/
|
||||
protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
// ref
|
||||
checkAllele(ref, true, lineNo);
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
|
||||
parseSingleAltAllele(alleles, alts, lineNo);
|
||||
else
|
||||
for ( String alt : alts.split(",") )
|
||||
parseSingleAltAllele(alleles, alt, lineNo);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* check to make sure the allele is an acceptable allele
|
||||
* @param allele the allele to check
|
||||
* @param isRef are we the reference allele?
|
||||
* @param lineNo the line number for this record
|
||||
*/
|
||||
private static void checkAllele(String allele, boolean isRef, int lineNo) {
|
||||
if ( allele == null || allele.length() == 0 )
|
||||
generateException("Empty alleles are not permitted in VCF records", lineNo);
|
||||
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) {
|
||||
System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo));
|
||||
}
|
||||
|
||||
if ( isSymbolicAllele(allele) ) {
|
||||
if ( isRef ) {
|
||||
generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo);
|
||||
}
|
||||
} else {
|
||||
// check for VCF3 insertions or deletions
|
||||
if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) )
|
||||
generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" +
|
||||
" convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo);
|
||||
|
||||
if (!Allele.acceptableAlleleBases(allele))
|
||||
generateException("Unparsable vcf record with allele " + allele, lineNo);
|
||||
|
||||
if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) )
|
||||
generateException("The reference allele cannot be missing", lineNo);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return true if this is a symbolic allele (e.g. <SOMETAG>) or
|
||||
* structural variation breakend (with [ or ]), otherwise false
|
||||
* @param allele the allele to check
|
||||
* @return true if the allele is a symbolic allele, otherwise false
|
||||
*/
|
||||
private static boolean isSymbolicAllele(String allele) {
|
||||
return (allele != null && allele.length() > 2 &&
|
||||
((allele.startsWith("<") && allele.endsWith(">")) ||
|
||||
(allele.contains("[") || allele.contains("]"))));
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a single allele, given the allele list
|
||||
* @param alleles the alleles available
|
||||
* @param alt the allele to parse
|
||||
* @param lineNo the line number for this record
|
||||
*/
|
||||
private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) {
|
||||
checkAllele(alt, false, lineNo);
|
||||
|
||||
Allele allele = Allele.create(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
alleles.add(allele);
|
||||
}
|
||||
|
||||
public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) {
|
||||
try {
|
||||
return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
|
||||
isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) ||
|
||||
isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
return false;
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
|
||||
try {
|
||||
byte[] buff = new byte[MAGIC_HEADER_LINE.length()];
|
||||
int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length());
|
||||
boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes());
|
||||
return eq;
|
||||
// String firstLine = new String(buff);
|
||||
// return firstLine.startsWith(MAGIC_HEADER_LINE);
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
} catch ( RuntimeException e ) {
|
||||
return false;
|
||||
} finally {
|
||||
try { stream.close(); } catch ( IOException e ) {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* create a genotype map
|
||||
*
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
|
||||
final List<Allele> alleles,
|
||||
final String chr,
|
||||
final int pos) {
|
||||
if (genotypeParts == null)
|
||||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
if ( nParts != genotypeParts.length )
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo);
|
||||
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
// get the format keys
|
||||
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
|
||||
|
||||
// cycle through the sample names
|
||||
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
||||
|
||||
// clear out our allele mapping
|
||||
alleleMap.clear();
|
||||
|
||||
// cycle through the genotype strings
|
||||
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
|
||||
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
|
||||
|
||||
final String sampleName = sampleNameIterator.next();
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sampleName);
|
||||
|
||||
// check to see if the value list is longer than the key list, which is a problem
|
||||
if (nGTKeys < GTValueSplitSize)
|
||||
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
|
||||
|
||||
int genotypeAlleleLocation = -1;
|
||||
if (nGTKeys >= 1) {
|
||||
gb.maxAttributes(nGTKeys - 1);
|
||||
|
||||
for (int i = 0; i < nGTKeys; i++) {
|
||||
final String gtKey = genotypeKeyArray[i];
|
||||
boolean missing = i >= GTValueSplitSize;
|
||||
|
||||
// todo -- all of these on the fly parsing of the missing value should be static constants
|
||||
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||
genotypeAlleleLocation = i;
|
||||
} else if ( missing ) {
|
||||
// if its truly missing (there no provided value) skip adding it to the attributes
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
|
||||
final List<String> filters = parseFilters(getCachedString(GTValueArray[i]));
|
||||
if ( filters != null ) gb.filters(filters);
|
||||
} else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) {
|
||||
// don't add missing values to the map
|
||||
} else {
|
||||
if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||
if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) )
|
||||
gb.noGQ();
|
||||
else
|
||||
gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i])));
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
|
||||
gb.AD(decodeInts(GTValueArray[i]));
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) {
|
||||
gb.PL(decodeInts(GTValueArray[i]));
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
|
||||
gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs());
|
||||
} else if (gtKey.equals(VCFConstants.DEPTH_KEY)) {
|
||||
gb.DP(Integer.valueOf(GTValueArray[i]));
|
||||
} else {
|
||||
gb.attribute(gtKey, GTValueArray[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check to make sure we found a genotype field if our version is less than 4.1 file
|
||||
if ( version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1 )
|
||||
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
|
||||
if ( genotypeAlleleLocation > 0 )
|
||||
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
|
||||
|
||||
final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
|
||||
gb.alleles(GTalleles);
|
||||
gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1);
|
||||
|
||||
// add it to the list
|
||||
try {
|
||||
genotypes.add(gb.make());
|
||||
} catch (TribbleException e) {
|
||||
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
|
||||
}
|
||||
}
|
||||
|
||||
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
|
||||
}
|
||||
|
||||
|
||||
private final static String[] INT_DECODE_ARRAY = new String[10000];
|
||||
private final static int[] decodeInts(final String string) {
|
||||
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
|
||||
final int[] values = new int[nValues];
|
||||
for ( int i = 0; i < nValues; i++ )
|
||||
values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]);
|
||||
return values;
|
||||
}
|
||||
|
||||
/**
|
||||
* Forces all VCFCodecs to not perform any on the fly modifications to the VCF header
|
||||
* of VCF records. Useful primarily for raw comparisons such as when comparing
|
||||
* raw VCF records
|
||||
*/
|
||||
public final void disableOnTheFlyModifications() {
|
||||
doOnTheFlyModifications = false;
|
||||
}
|
||||
|
||||
|
||||
protected void generateException(String message) {
|
||||
throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message));
|
||||
}
|
||||
|
||||
protected static void generateException(String message, int lineNo) {
|
||||
throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been
|
||||
* depreciated in favor of VCF4 (See VCF codec for the latest information)
|
||||
*
|
||||
* <p>
|
||||
* Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example)
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
|
||||
* See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
|
||||
* </p>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
*/
|
||||
public class VCF3Codec extends AbstractVCFCodec {
|
||||
public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3";
|
||||
|
||||
|
||||
/**
|
||||
* @param reader the line reader to take header lines from
|
||||
* @return the number of header lines
|
||||
*/
|
||||
public Object readHeader(LineReader reader) {
|
||||
List<String> headerStrings = new ArrayList<String>();
|
||||
|
||||
String line;
|
||||
VCFHeaderVersion version = null;
|
||||
try {
|
||||
boolean foundHeaderVersion = false;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
lineNo++;
|
||||
if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
|
||||
String[] lineFields = line.substring(2).split("=");
|
||||
if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
|
||||
if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
|
||||
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
|
||||
foundHeaderVersion = true;
|
||||
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
|
||||
if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 )
|
||||
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]);
|
||||
}
|
||||
headerStrings.add(line);
|
||||
}
|
||||
else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
|
||||
if (!foundHeaderVersion) {
|
||||
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
|
||||
}
|
||||
headerStrings.add(line);
|
||||
return super.parseHeaderFromLines(headerStrings, version);
|
||||
}
|
||||
else {
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IO Exception ", e);
|
||||
}
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
|
||||
* @param filterString the string to parse
|
||||
* @return a set of the filters applied
|
||||
*/
|
||||
protected List<String> parseFilters(String filterString) {
|
||||
|
||||
// null for unfiltered
|
||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
||||
return null;
|
||||
|
||||
// empty set for passes filters
|
||||
List<String> fFields = new ArrayList<String>();
|
||||
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
|
||||
return new ArrayList<String>(fFields);
|
||||
|
||||
if ( filterString.length() == 0 )
|
||||
generateException("The VCF specification requires a valid filter status");
|
||||
|
||||
// do we have the filter string cached?
|
||||
if ( filterHash.containsKey(filterString) )
|
||||
return new ArrayList<String>(filterHash.get(filterString));
|
||||
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
||||
filterHash.put(filterString, fFields);
|
||||
|
||||
return fFields;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canDecode(final String potentialInput) {
|
||||
return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,159 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A feature codec for the VCF 4 specification
|
||||
*
|
||||
* <p>
|
||||
* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a
|
||||
* header line, and then data lines each containing information about a position in the genome.
|
||||
* </p>
|
||||
* <p>One of the main uses of next-generation sequencing is to discover variation amongst large populations
|
||||
* of related samples. Recently the format for storing next-generation read alignments has been
|
||||
* standardised by the SAM/BAM file format specification. This has significantly improved the
|
||||
* interoperability of next-generation tools for alignment, visualisation, and variant calling.
|
||||
* We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent
|
||||
* types of sequence variation, including SNPs, indels and larger structural variants, together
|
||||
* with rich annotations. VCF is usually stored in a compressed manner and can be indexed for
|
||||
* fast data retrieval of variants from a range of positions on the reference genome.
|
||||
* The format was developed for the 1000 Genomes Project, and has also been adopted by other projects
|
||||
* such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements
|
||||
* various utilities for processing VCF files, including validation, merging and comparing,
|
||||
* and also provides a general Perl and Python API.
|
||||
* The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.</p>
|
||||
*
|
||||
* <p>
|
||||
* See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
|
||||
* See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
|
||||
* </p>
|
||||
*
|
||||
* <h2>File format example</h2>
|
||||
* <pre>
|
||||
* ##fileformat=VCFv4.0
|
||||
* #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
|
||||
* chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99
|
||||
* chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
*/
|
||||
public class VCFCodec extends AbstractVCFCodec {
|
||||
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
|
||||
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
|
||||
|
||||
/**
|
||||
* @param reader the line reader to take header lines from
|
||||
* @return the number of header lines
|
||||
*/
|
||||
public Object readHeader(LineReader reader) {
|
||||
List<String> headerStrings = new ArrayList<String>();
|
||||
|
||||
String line;
|
||||
try {
|
||||
boolean foundHeaderVersion = false;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
lineNo++;
|
||||
if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
|
||||
String[] lineFields = line.substring(2).split("=");
|
||||
if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
|
||||
if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
|
||||
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
|
||||
foundHeaderVersion = true;
|
||||
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
|
||||
if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 )
|
||||
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
|
||||
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 )
|
||||
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
|
||||
}
|
||||
headerStrings.add(line);
|
||||
}
|
||||
else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
|
||||
if (!foundHeaderVersion) {
|
||||
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
|
||||
}
|
||||
headerStrings.add(line);
|
||||
super.parseHeaderFromLines(headerStrings, version);
|
||||
return this.header;
|
||||
}
|
||||
else {
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IO Exception ", e);
|
||||
}
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
/**
|
||||
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
|
||||
*
|
||||
* @param filterString the string to parse
|
||||
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
|
||||
*/
|
||||
protected List<String> parseFilters(String filterString) {
|
||||
// null for unfiltered
|
||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
||||
return null;
|
||||
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
|
||||
return Collections.emptyList();
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
|
||||
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
|
||||
if ( filterString.length() == 0 )
|
||||
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
|
||||
|
||||
// do we have the filter string cached?
|
||||
if ( filterHash.containsKey(filterString) )
|
||||
return filterHash.get(filterString);
|
||||
|
||||
// empty set for passes filters
|
||||
List<String> fFields = new LinkedList<String>();
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
||||
filterHash.put(filterString, Collections.unmodifiableList(fFields));
|
||||
|
||||
return fFields;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canDecode(final String potentialInput) {
|
||||
return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,258 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* a base class for compound header lines, which include info lines and format lines (so far)
|
||||
*/
|
||||
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||
|
||||
public enum SupportedHeaderLineType {
|
||||
INFO(true), FORMAT(false);
|
||||
|
||||
public final boolean allowFlagValues;
|
||||
SupportedHeaderLineType(boolean flagValues) {
|
||||
allowFlagValues = flagValues;
|
||||
}
|
||||
}
|
||||
|
||||
// the field types
|
||||
private String name;
|
||||
private int count = -1;
|
||||
private VCFHeaderLineCount countType;
|
||||
private String description;
|
||||
private VCFHeaderLineType type;
|
||||
|
||||
// access methods
|
||||
public String getID() { return name; }
|
||||
public String getDescription() { return description; }
|
||||
public VCFHeaderLineType getType() { return type; }
|
||||
public VCFHeaderLineCount getCountType() { return countType; }
|
||||
public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; }
|
||||
public int getCount() {
|
||||
if ( ! isFixedCount() )
|
||||
throw new TribbleException("Asking for header line count when type is not an integer");
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of values expected for this header field, given the properties of VariantContext vc
|
||||
*
|
||||
* If the count is a fixed count, return that. For example, a field with size of 1 in the header returns 1
|
||||
* If the count is of type A, return vc.getNAlleles - 1
|
||||
* If the count is of type G, return the expected number of genotypes given the number of alleles in VC and the
|
||||
* max ploidy among all samples. Note that if the max ploidy of the VC is 0 (there's no GT information
|
||||
* at all, then implicitly assume diploid samples when computing G values.
|
||||
* If the count is UNBOUNDED return -1
|
||||
*
|
||||
* @param vc
|
||||
* @return
|
||||
*/
|
||||
public int getCount(final VariantContext vc) {
|
||||
switch ( countType ) {
|
||||
case INTEGER: return count;
|
||||
case UNBOUNDED: return -1;
|
||||
case A: return vc.getNAlleles() - 1;
|
||||
case G:
|
||||
final int ploidy = vc.getMaxPloidy(2);
|
||||
return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy);
|
||||
default:
|
||||
throw new TribbleException("Unknown count type: " + countType);
|
||||
}
|
||||
}
|
||||
|
||||
public void setNumberToUnbounded() {
|
||||
countType = VCFHeaderLineCount.UNBOUNDED;
|
||||
count = -1;
|
||||
}
|
||||
|
||||
// our type of line, i.e. format, info, etc
|
||||
private final SupportedHeaderLineType lineType;
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.name = name;
|
||||
this.countType = VCFHeaderLineCount.INTEGER;
|
||||
this.count = count;
|
||||
this.type = type;
|
||||
this.description = description;
|
||||
this.lineType = lineType;
|
||||
validate();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count type for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.name = name;
|
||||
this.countType = count;
|
||||
this.type = type;
|
||||
this.description = description;
|
||||
this.lineType = lineType;
|
||||
validate();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the VCF header version
|
||||
* @param lineType the header line type
|
||||
*
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||
name = mapping.get("ID");
|
||||
count = -1;
|
||||
final String numberStr = mapping.get("Number");
|
||||
if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) {
|
||||
countType = VCFHeaderLineCount.A;
|
||||
} else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) {
|
||||
countType = VCFHeaderLineCount.G;
|
||||
} else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) &&
|
||||
numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) ||
|
||||
((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) &&
|
||||
numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) {
|
||||
countType = VCFHeaderLineCount.UNBOUNDED;
|
||||
} else {
|
||||
countType = VCFHeaderLineCount.INTEGER;
|
||||
count = Integer.valueOf(numberStr);
|
||||
|
||||
}
|
||||
|
||||
if ( count < 0 && countType == VCFHeaderLineCount.INTEGER )
|
||||
throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name);
|
||||
|
||||
try {
|
||||
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
|
||||
} catch (Exception e) {
|
||||
throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)");
|
||||
}
|
||||
if (type == VCFHeaderLineType.Flag && !allowFlagValues())
|
||||
throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");
|
||||
|
||||
description = mapping.get("Description");
|
||||
if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
|
||||
description = UNBOUND_DESCRIPTION;
|
||||
|
||||
this.lineType = lineType;
|
||||
|
||||
validate();
|
||||
}
|
||||
|
||||
private void validate() {
|
||||
if ( name == null || type == null || description == null || lineType == null )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
|
||||
super.getKey(), name, type, description, lineType ));
|
||||
|
||||
if ( type == VCFHeaderLineType.Flag && count != 0 ) {
|
||||
count = 0;
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
|
||||
System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* make a string representation of this header line
|
||||
* @return a string representation
|
||||
*/
|
||||
protected String toStringEncoding() {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID", name);
|
||||
Object number;
|
||||
switch ( countType ) {
|
||||
case A: number = VCFConstants.PER_ALLELE_COUNT; break;
|
||||
case G: number = VCFConstants.PER_GENOTYPE_COUNT; break;
|
||||
case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break;
|
||||
case INTEGER:
|
||||
default: number = count;
|
||||
}
|
||||
map.put("Number", number);
|
||||
map.put("Type", type);
|
||||
map.put("Description", description);
|
||||
return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map);
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if we're equal to another compounder header line
|
||||
* @param o a compound header line
|
||||
* @return true if equal
|
||||
*/
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFCompoundHeaderLine) )
|
||||
return false;
|
||||
VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o;
|
||||
return equalsExcludingDescription(other) &&
|
||||
description.equals(other.description);
|
||||
}
|
||||
|
||||
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
|
||||
return count == other.count &&
|
||||
countType == other.countType &&
|
||||
type == other.type &&
|
||||
lineType == other.lineType &&
|
||||
name.equals(other.name);
|
||||
}
|
||||
|
||||
public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) {
|
||||
return lineType == other.lineType &&
|
||||
name.equals(other.name);
|
||||
}
|
||||
|
||||
/**
|
||||
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
|
||||
* @return true if we do, false otherwise
|
||||
*/
|
||||
abstract boolean allowFlagValues();
|
||||
|
||||
}
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Constants used throughout the VCF reading/writing code: reserved INFO/FORMAT keys,
 * field separators, header-line prefixes, and the sentinel values used for missing data.
 * This is a pure constant holder and is never instantiated.
 */
public final class VCFConstants {

    /** Utility class: prevent instantiation (the implicit public constructor served no purpose). */
    private VCFConstants() {}

    public static final Locale VCF_LOCALE = Locale.US;

    // reserved INFO/FORMAT field keys
    public static final String ANCESTRAL_ALLELE_KEY = "AA";
    public static final String ALLELE_COUNT_KEY = "AC";
    public static final String MLE_ALLELE_COUNT_KEY = "MLEAC";
    public static final String ALLELE_FREQUENCY_KEY = "AF";
    public static final String MLE_ALLELE_FREQUENCY_KEY = "MLEAF";
    public static final String MLE_PER_SAMPLE_ALLELE_COUNT_KEY = "MLPSAC";
    public static final String MLE_PER_SAMPLE_ALLELE_FRACTION_KEY = "MLPSAF";
    public static final String ALLELE_NUMBER_KEY = "AN";
    public static final String RMS_BASE_QUALITY_KEY = "BQ";
    public static final String CIGAR_KEY = "CIGAR";
    public static final String DBSNP_KEY = "DB";
    public static final String DEPTH_KEY = "DP";
    public static final String DOWNSAMPLED_KEY = "DS";
    public static final String EXPECTED_ALLELE_COUNT_KEY = "EC";
    public static final String END_KEY = "END";

    public static final String GENOTYPE_FILTER_KEY = "FT";
    public static final String GENOTYPE_KEY = "GT";
    public static final String GENOTYPE_POSTERIORS_KEY = "GP";
    public static final String GENOTYPE_QUALITY_KEY = "GQ";
    public static final String GENOTYPE_ALLELE_DEPTHS = "AD";
    public static final String GENOTYPE_PL_KEY = "PL";   // phred-scaled genotype likelihoods
    @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL";   // log10 scaled genotype likelihoods

    public static final String HAPMAP2_KEY = "H2";
    public static final String HAPMAP3_KEY = "H3";
    public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
    public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
    public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0";
    public static final String SAMPLE_NUMBER_KEY = "NS";
    public static final String PHASE_QUALITY_KEY = "PQ";
    public static final String PHASE_SET_KEY = "PS";
    public static final String OLD_DEPTH_KEY = "RD";
    public static final String STRAND_BIAS_KEY = "SB";
    public static final String SOMATIC_KEY = "SOMATIC";
    public static final String VALIDATED_KEY = "VALIDATED";
    public static final String THOUSAND_GENOMES_KEY = "1000G";

    // separators
    public static final String FORMAT_FIELD_SEPARATOR = ":";
    public static final String GENOTYPE_FIELD_SEPARATOR = ":";
    public static final char   GENOTYPE_FIELD_SEPARATOR_CHAR = ':';
    public static final String FIELD_SEPARATOR = "\t";
    public static final char   FIELD_SEPARATOR_CHAR = '\t';
    public static final String FILTER_CODE_SEPARATOR = ";";
    public static final String INFO_FIELD_ARRAY_SEPARATOR = ",";
    public static final char   INFO_FIELD_ARRAY_SEPARATOR_CHAR = ',';
    public static final String ID_FIELD_SEPARATOR = ";";
    public static final String INFO_FIELD_SEPARATOR = ";";
    public static final char   INFO_FIELD_SEPARATOR_CHAR = ';';
    public static final String UNPHASED = "/";
    public static final String PHASED = "|";
    public static final String PHASED_SWITCH_PROB_v3 = "\\";
    public static final String PHASING_TOKENS = "/|\\";

    // header lines
    public static final String FILTER_HEADER_START = "##FILTER";
    public static final String FORMAT_HEADER_START = "##FORMAT";
    public static final String INFO_HEADER_START = "##INFO";
    public static final String ALT_HEADER_START = "##ALT";
    public static final String CONTIG_HEADER_KEY = "contig";
    public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY;

    // old indel alleles
    public static final char DELETION_ALLELE_v3 = 'D';
    public static final char INSERTION_ALLELE_v3 = 'I';

    // missing/default values
    public static final String UNFILTERED = ".";
    public static final String PASSES_FILTERS_v3 = "0";
    public static final String PASSES_FILTERS_v4 = "PASS";
    public static final String EMPTY_ID_FIELD = ".";
    public static final String EMPTY_INFO_FIELD = ".";
    public static final String EMPTY_ALTERNATE_ALLELE_FIELD = ".";
    public static final String MISSING_VALUE_v4 = ".";
    public static final String MISSING_QUALITY_v3 = "-1";
    public static final Double MISSING_QUALITY_v3_DOUBLE = Double.valueOf(MISSING_QUALITY_v3);

    public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_DEPTH_v3 = "-1";
    public static final String UNBOUNDED_ENCODING_v4 = ".";
    public static final String UNBOUNDED_ENCODING_v3 = "-1";
    public static final String PER_ALLELE_COUNT = "A";
    public static final String PER_GENOTYPE_COUNT = "G";
    public static final String EMPTY_ALLELE = ".";
    public static final String EMPTY_GENOTYPE = "./.";
    public static final int MAX_GENOTYPE_QUAL = 99;

    public static final Double VCF_ENCODING_EPSILON = 0.00005;   // when we consider fields equal(), used in the Qual compare
    public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH";
}
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A special class representing a contig VCF header line. Nows the true contig order and sorts on that
|
||||
*
|
||||
* @author mdepristo
|
||||
*/
|
||||
public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
|
||||
final Integer contigIndex;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF contig header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the vcf header version
|
||||
* @param key the key for this header line
|
||||
*/
|
||||
public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, int contigIndex) {
|
||||
super(line, version, key, null);
|
||||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
public VCFContigHeaderLine(final Map<String, String> mapping, int contigIndex) {
|
||||
super(VCFHeader.CONTIG_KEY, mapping, null);
|
||||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
public Integer getContigIndex() {
|
||||
return contigIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* IT IS CRITIAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public int compareTo(final Object other) {
|
||||
if ( other instanceof VCFContigHeaderLine )
|
||||
return contigIndex.compareTo(((VCFContigHeaderLine) other).contigIndex);
|
||||
else {
|
||||
return super.compareTo(other);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * @author ebanks
 * A class representing a key=value entry for FILTER fields in the VCF header
 */
public class VCFFilterHeaderLine extends VCFSimpleHeaderLine {

    /**
     * create a VCF filter header line
     *
     * @param name         the name for this header line
     * @param description  the description for this header line
     */
    public VCFFilterHeaderLine(String name, String description) {
        super("FILTER", name, description);
    }

    /**
     * Convenience constructor for FILTER whose description is the name
     * @param name the filter name, reused verbatim as the description
     */
    public VCFFilterHeaderLine(String name) {
        super("FILTER", name, name);
    }

    /**
     * create a VCF filter header line by parsing its textual representation
     * (the original comment said "info header line" — this parses FILTER lines)
     *
     * @param line     the header line
     * @param version  the vcf header version
     */
    public VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, "FILTER", Arrays.asList("ID", "Description"));
    }
}
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
|
||||
/**
 * @author ebanks
 * <p/>
 * Class VCFFormatHeaderLine
 * <p/>
 * A class representing a key=value entry for genotype FORMAT fields in the VCF header
 */
public class VCFFormatHeaderLine extends VCFCompoundHeaderLine {

    /**
     * Create a FORMAT line with a fixed integer count.
     *
     * @param name        the field ID
     * @param count       the fixed number of values per genotype
     * @param type        the value type; Flag is rejected (see allowFlagValues below)
     * @param description human-readable description of the field
     */
    public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.FORMAT);
        if (type == VCFHeaderLineType.Flag)
            throw new IllegalArgumentException("Flag is an unsupported type for format fields");
    }

    /**
     * Create a FORMAT line with a symbolic count type (A, G, or unbounded).
     *
     * @param name        the field ID
     * @param count       the symbolic count type
     * @param type        the value type
     * @param description human-readable description of the field
     */
    public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.FORMAT);
    }

    /**
     * Parse a FORMAT line from its textual representation.
     *
     * @param line    the raw header line
     * @param version the VCF header version
     */
    public VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.FORMAT);
    }

    // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype).
    @Override
    boolean allowFlagValues() {
        return false;
    }
}
|
||||
|
|
@ -1,454 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* This class is really a POS. It allows duplicate entries in the metadata,
|
||||
* stores header lines in lots of places, and all around f*cking sucks.
|
||||
*
|
||||
* todo -- clean this POS up
|
||||
*
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFHeader
|
||||
* <p/>
|
||||
* A class representing the VCF header
|
||||
*/
|
||||
public class VCFHeader {
|
||||
|
||||
    // the mandatory header fields
    public enum HEADER_FIELDS {
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
    }

    // the associated meta data
    private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
    // typed views of mMetaData, keyed by line ID; rebuilt by loadMetaDataMaps()
    private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
    private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
    private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
    private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
    // ##contig lines in the order they appeared in the file
    private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();

    // the list of auxillary tags
    private final List<String> mGenotypeSampleNames = new ArrayList<String>();

    // the character string that indicates meta data
    public static final String METADATA_INDICATOR = "##";

    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    // well-known keys for engine-related header lines
    public static final String SOURCE_KEY = "source";
    public static final String REFERENCE_KEY = "reference";
    public static final String CONTIG_KEY = "contig";
    public static final String INTERVALS_KEY = "intervals";
    public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
    public static final String INTERVAL_MERGING_KEY = "interval_merging";
    public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
    public static final String INTERVAL_PADDING_KEY = "interval_padding";

    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

    // cache for efficient conversion of VCF -> VariantContext
    private ArrayList<String> sampleNamesInOrder = null;          // sorted sample names (see buildVCFReaderMaps)
    private HashMap<String, Integer> sampleNameToOffset = null;   // sample name -> appearance index

    // NOTE(review): presumably consulted by the VCF writer to decide whether engine
    // headers / the command line are emitted — not used within this class; confirm at call sites
    private boolean writeEngineHeaders = true;
    private boolean writeCommandLine = true;
|
||||
|
||||
    /**
     * Create an empty VCF header with no header lines and no samples
     */
    public VCFHeader() {
        this(Collections.<VCFHeaderLine>emptySet(), Collections.<String>emptySet());
    }

    /**
     * create a VCF header, given a list of meta data and auxillary tags
     *
     * @param metaData       the meta data associated with this header
     */
    public VCFHeader(Set<VCFHeaderLine> metaData) {
        mMetaData.addAll(metaData);
        loadVCFVersion();     // strips any embedded fileformat/version lines
        loadMetaDataMaps();   // builds the typed INFO/FORMAT/FILTER/contig lookup maps
    }

    /**
     * Creates a shallow copy of the meta data in VCF header toCopy
     *
     * @param toCopy the header whose metadata is copied; note the sample names are NOT copied
     */
    public VCFHeader(final VCFHeader toCopy) {
        this(toCopy.mMetaData);
    }

    /**
     * create a VCF header, given a list of meta data and auxillary tags
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
        this(metaData, new ArrayList<String>(genotypeSampleNames));
    }

    /**
     * create a VCF header from meta data plus an ordered list of sample names
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names, in order of appearance in the file
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, List<String> genotypeSampleNames) {
        this(metaData);

        // comparing list size against set size detects duplicate sample names
        if ( genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size() )
            throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names");

        mGenotypeSampleNames.addAll(genotypeSampleNames);
        samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
        buildVCFReaderMaps(genotypeSampleNames);
    }
|
||||
|
||||
/**
|
||||
* Tell this VCF header to use pre-calculated sample name ordering and the
|
||||
* sample name -> offset map. This assumes that all VariantContext created
|
||||
* using this header (i.e., read by the VCFCodec) will have genotypes
|
||||
* occurring in the same order
|
||||
*
|
||||
* @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearence
|
||||
*/
|
||||
private void buildVCFReaderMaps(Collection<String> genotypeSampleNamesInAppearenceOrder) {
|
||||
sampleNamesInOrder = new ArrayList<String>(genotypeSampleNamesInAppearenceOrder.size());
|
||||
sampleNameToOffset = new HashMap<String, Integer>(genotypeSampleNamesInAppearenceOrder.size());
|
||||
|
||||
int i = 0;
|
||||
for ( final String name : genotypeSampleNamesInAppearenceOrder ) {
|
||||
sampleNamesInOrder.add(name);
|
||||
sampleNameToOffset.put(name, i++);
|
||||
}
|
||||
Collections.sort(sampleNamesInOrder);
|
||||
}
|
||||
|
||||
|
||||
    /**
     * Adds a header line to the header metadata.
     *
     * NOTE: this rebuilds every typed lookup map from scratch on each call, so adding
     * many lines one at a time is quadratic in the number of header lines.
     *
     * @param headerLine Line to add to the existing metadata component.
     */
    public void addMetaDataLine(VCFHeaderLine headerLine) {
        mMetaData.add(headerLine);
        loadMetaDataMaps();   // re-index so the new line is visible in the typed maps
    }
|
||||
|
||||
    /**
     * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
     */
    public List<VCFContigHeaderLine> getContigLines() {
        // contigMetaData is populated in file order by loadMetaDataMaps()
        return Collections.unmodifiableList(contigMetaData);
    }
|
||||
|
||||
|
||||
/**
|
||||
* @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFFilterHeaderLine> getFilterLines() {
|
||||
final List<VCFFilterHeaderLine> filters = new ArrayList<VCFFilterHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFFilterHeaderLine ) {
|
||||
filters.add((VCFFilterHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFIDHeaderLine> getIDHeaderLines() {
|
||||
final List<VCFIDHeaderLine> filters = new ArrayList<VCFIDHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
filters.add((VCFIDHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
    /**
     * check our metadata for a VCF version tag, and throw an exception if the version is out of date
     * or the version is not present
     *
     * NOTE(review): despite the summary above, this method does not currently throw anything —
     * it only strips format/version lines from the metadata.  Confirm intent before relying
     * on it for version validation.
     */
    public void loadVCFVersion() {
        List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
        for ( VCFHeaderLine line : mMetaData )
            if ( VCFHeaderVersion.isFormatString(line.getKey())) {
                toRemove.add(line);
            }
        // remove old header lines for now,
        mMetaData.removeAll(toRemove);

    }
|
||||
|
||||
    /**
     * load the format/info meta data maps (these are used for quick lookup by key name)
     *
     * Dispatches every line in mMetaData into its typed map (INFO/FORMAT/FILTER/contig/other).
     * NOTE: when a legacy GL format line is present without a PL line, this calls
     * addMetaDataLine(), which re-enters this method — safe only because the added PL line
     * satisfies hasFormatLine on the nested pass.
     */
    private void loadMetaDataMaps() {
        for ( VCFHeaderLine line : mMetaData ) {
            if ( line instanceof VCFInfoHeaderLine ) {
                VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
                addMetaDataMapBinding(mInfoMetaData, infoLine);
            } else if ( line instanceof VCFFormatHeaderLine ) {
                VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
                addMetaDataMapBinding(mFormatMetaData, formatLine);
            } else if ( line instanceof VCFFilterHeaderLine ) {
                VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line;
                mFilterMetaData.put(filterLine.getID(), filterLine);
            } else if ( line instanceof VCFContigHeaderLine ) {
                contigMetaData.add((VCFContigHeaderLine)line);
            } else {
                // keyed by line key, so later lines with the same key overwrite earlier ones here
                mOtherMetaData.put(line.getKey(), line);
            }
        }

        // legacy GL-only headers: genotype likelihoods are managed as PL internally,
        // so synthesize a matching PL format line
        if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
                        + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally"
                        + " automatically adding a corresponding PL field to your VCF header");
            }
            addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
        }
    }
|
||||
|
||||
/**
|
||||
* Add line to map, issuing warnings about duplicates
|
||||
*
|
||||
* @param map
|
||||
* @param line
|
||||
* @param <T>
|
||||
*/
|
||||
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, T line) {
|
||||
final String key = line.getID();
|
||||
if ( map.containsKey(key) ) {
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
|
||||
System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" );
|
||||
}
|
||||
}
|
||||
else {
|
||||
map.put(key, line);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header fields in order they're presented in the input file (which is now required to be
|
||||
* the order presented in the spec).
|
||||
*
|
||||
* @return a set of the header fields, in order
|
||||
*/
|
||||
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||
return new LinkedHashSet<HEADER_FIELDS>(Arrays.asList(HEADER_FIELDS.values()));
|
||||
}
|
||||
|
||||
/**
|
||||
* get the meta data, associated with this header, in sorted order
|
||||
*
|
||||
* @return a set of the meta data
|
||||
*/
|
||||
public Set<VCFHeaderLine> getMetaDataInInputOrder() {
|
||||
return makeGetMetaDataSet(mMetaData);
|
||||
}
|
||||
|
||||
public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
|
||||
return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
|
||||
}
|
||||
|
||||
private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
|
||||
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
||||
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
|
||||
lines.addAll(headerLinesInSomeOrder);
|
||||
return Collections.unmodifiableSet(lines);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the VCFHeaderLine whose key equals key. Returns null if no such line exists
|
||||
* @param key
|
||||
* @return
|
||||
*/
|
||||
public VCFHeaderLine getMetaDataLine(final String key) {
|
||||
for (final VCFHeaderLine line: mMetaData) {
|
||||
if ( line.getKey().equals(key) )
|
||||
return line;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the genotyping sample names
|
||||
*
|
||||
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
|
||||
*/
|
||||
public List<String> getGenotypeSamples() {
|
||||
return mGenotypeSampleNames;
|
||||
}
|
||||
|
||||
public int getNGenotypeSamples() {
|
||||
return mGenotypeSampleNames.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*
|
||||
* @return true if we have genotyping columns, false otherwise
|
||||
*/
|
||||
public boolean hasGenotypingData() {
|
||||
return getNGenotypeSamples() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* were the input samples sorted originally?
|
||||
*
|
||||
* @return true if the input samples were sorted originally, false otherwise
|
||||
*/
|
||||
public boolean samplesWereAlreadySorted() {
|
||||
return samplesWereAlreadySorted;
|
||||
}
|
||||
|
||||
/** @return the column count */
|
||||
public int getColumnCount() {
|
||||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the INFO HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
|
||||
return mInfoMetaData.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the FORMAT HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
|
||||
return mFormatMetaData.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFInfoHeaderLine getInfoHeaderLine(String id) {
|
||||
return mInfoMetaData.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFFormatHeaderLine getFormatHeaderLine(String id) {
|
||||
return mFormatMetaData.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
|
||||
return mFilterMetaData.get(id);
|
||||
}
|
||||
|
||||
public boolean hasInfoLine(final String id) {
|
||||
return getInfoHeaderLine(id) != null;
|
||||
}
|
||||
|
||||
public boolean hasFormatLine(final String id) {
|
||||
return getFormatHeaderLine(id) != null;
|
||||
}
|
||||
|
||||
public boolean hasFilterLine(final String id) {
|
||||
return getFilterHeaderLine(id) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFHeaderLine getOtherHeaderLine(String key) {
|
||||
return mOtherMetaData.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
|
||||
* @return true if additional engine headers will be written to the VCF
|
||||
*/
|
||||
public boolean isWriteEngineHeaders() {
|
||||
return writeEngineHeaders;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
|
||||
* @param writeEngineHeaders true if additional engine headers will be written to the VCF
|
||||
*/
|
||||
public void setWriteEngineHeaders(boolean writeEngineHeaders) {
|
||||
this.writeEngineHeaders = writeEngineHeaders;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
|
||||
* @return true if the command line will be written to the VCF
|
||||
*/
|
||||
public boolean isWriteCommandLine() {
|
||||
return writeCommandLine;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
|
||||
* @param writeCommandLine true if the command line will be written to the VCF
|
||||
*/
|
||||
public void setWriteCommandLine(boolean writeCommandLine) {
|
||||
this.writeCommandLine = writeCommandLine;
|
||||
}
|
||||
|
||||
public ArrayList<String> getSampleNamesInOrder() {
|
||||
return sampleNamesInOrder;
|
||||
}
|
||||
|
||||
public HashMap<String, Integer> getSampleNameToOffset() {
|
||||
return sampleNameToOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
b.append("[VCFHeader:");
|
||||
for ( final VCFHeaderLine line : mMetaData )
|
||||
b.append("\n\t").append(line);
|
||||
return b.append("\n]").toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry in the VCF header
|
||||
*/
|
||||
public class VCFHeaderLine implements Comparable {
|
||||
protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true;
|
||||
protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header";
|
||||
|
||||
private String mKey = null;
|
||||
private String mValue = null;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF header line
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param value the value for this header line
|
||||
*/
|
||||
public VCFHeaderLine(String key, String value) {
|
||||
if ( key == null )
|
||||
throw new IllegalArgumentException("VCFHeaderLine: key cannot be null");
|
||||
mKey = key;
|
||||
mValue = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the key
|
||||
*
|
||||
* @return the key
|
||||
*/
|
||||
public String getKey() {
|
||||
return mKey;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value
|
||||
*
|
||||
* @return the value
|
||||
*/
|
||||
public String getValue() {
|
||||
return mValue;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return toStringEncoding();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should be overloaded in sub classes to do subclass specific
|
||||
*
|
||||
* @return the string encoding
|
||||
*/
|
||||
protected String toStringEncoding() {
|
||||
return mKey + "=" + mValue;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFHeaderLine) )
|
||||
return false;
|
||||
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
return toString().compareTo(other.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param line the line
|
||||
* @return true if the line is a VCF meta data line, or false if it is not
|
||||
*/
|
||||
public static boolean isHeaderLine(String line) {
|
||||
return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1));
|
||||
}
|
||||
|
||||
/**
|
||||
* create a string of a mapping pair for the target VCF version
|
||||
* @param keyValues a mapping of the key->value pairs to output
|
||||
* @return a string, correctly formatted
|
||||
*/
|
||||
public static String toStringEncoding(Map<String, ? extends Object> keyValues) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("<");
|
||||
boolean start = true;
|
||||
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||
if (start) start = false;
|
||||
else builder.append(",");
|
||||
|
||||
if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues);
|
||||
|
||||
builder.append(entry.getKey());
|
||||
builder.append("=");
|
||||
builder.append(entry.getValue().toString().contains(",") ||
|
||||
entry.getValue().toString().contains(" ") ||
|
||||
entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||
}
|
||||
builder.append(">");
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
/**
|
||||
* the count encodings we use for fields in VCF header lines
|
||||
*/
|
||||
public enum VCFHeaderLineCount {
|
||||
INTEGER, A, G, UNBOUNDED;
|
||||
}
|
||||
|
|
@ -1,153 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A class for translating between vcf header versions
|
||||
*/
|
||||
public class VCFHeaderLineTranslator {
|
||||
private static Map<VCFHeaderVersion,VCFLineParser> mapping;
|
||||
|
||||
static {
|
||||
mapping = new HashMap<VCFHeaderVersion,VCFLineParser>();
|
||||
mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF4_1,new VCF4Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF3_2,new VCF3Parser());
|
||||
}
|
||||
|
||||
public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
|
||||
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
interface VCFLineParser {
|
||||
public Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* a class that handles the to and from disk for VCF 4 lines
|
||||
*/
|
||||
class VCF4Parser implements VCFLineParser {
|
||||
/**
|
||||
* parse a VCF4 line
|
||||
* @param valueLine the line
|
||||
* @return a mapping of the tags parsed out
|
||||
*/
|
||||
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||
// our return map
|
||||
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||
|
||||
// a builder to store up characters as we go
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// store the key when we're parsing out the values
|
||||
String key = "";
|
||||
|
||||
// where are we in the stream of characters?
|
||||
int index = 0;
|
||||
|
||||
// are we inside a quotation? we don't special case ',' then
|
||||
boolean inQuote = false;
|
||||
|
||||
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly [yes, but this machine is getting ugly now... MAD]
|
||||
for (char c: valueLine.toCharArray()) {
|
||||
if ( c == '\"' ) {
|
||||
inQuote = ! inQuote;
|
||||
} else if ( inQuote ) {
|
||||
builder.append(c);
|
||||
} else {
|
||||
switch (c) {
|
||||
case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it
|
||||
case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list
|
||||
case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder
|
||||
case (',') : ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; // drop the current key value to the return map
|
||||
default: builder.append(c); // otherwise simply append to the current string
|
||||
}
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if ( expectedTagOrder != null ) {
|
||||
if ( ret.size() > expectedTagOrder.size() )
|
||||
throw new TribbleException.InvalidHeader("unexpected tag count " + ret.size() + " in line " + valueLine);
|
||||
for ( String str : ret.keySet() ) {
|
||||
if ( !expectedTagOrder.get(index).equals(str) )
|
||||
throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine);
|
||||
index++;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
class VCF3Parser implements VCFLineParser {
|
||||
|
||||
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||
// our return map
|
||||
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||
|
||||
// a builder to store up characters as we go
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// where are we in the stream of characters?
|
||||
int index = 0;
|
||||
// where in the expected tag order are we?
|
||||
int tagIndex = 0;
|
||||
|
||||
// are we inside a quotation? we don't special case ',' then
|
||||
boolean inQuote = false;
|
||||
|
||||
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
|
||||
for (char c: valueLine.toCharArray()) {
|
||||
switch (c) {
|
||||
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
|
||||
case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map
|
||||
default: builder.append(c); // otherwise simply append to the current string
|
||||
}
|
||||
index++;
|
||||
}
|
||||
ret.put(expectedTagOrder.get(tagIndex++),builder.toString());
|
||||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
|
||||
for (String str : ret.keySet()){
|
||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
/**
|
||||
* the type encodings we use for fields in VCF header lines
|
||||
*/
|
||||
public enum VCFHeaderLineType {
|
||||
Integer, Float, String, Character, Flag;
|
||||
}
|
||||
|
|
@ -1,116 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
|
||||
/**
|
||||
* information that identifies each header version
|
||||
*/
|
||||
public enum VCFHeaderVersion {
|
||||
VCF3_2("VCRv3.2","format"),
|
||||
VCF3_3("VCFv3.3","fileformat"),
|
||||
VCF4_0("VCFv4.0","fileformat"),
|
||||
VCF4_1("VCFv4.1","fileformat");
|
||||
|
||||
private final String versionString;
|
||||
private final String formatString;
|
||||
|
||||
/**
|
||||
* create the enum, privately, using:
|
||||
* @param vString the version string
|
||||
* @param fString the format string
|
||||
*/
|
||||
VCFHeaderVersion(String vString, String fString) {
|
||||
this.versionString = vString;
|
||||
this.formatString = fString;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header version
|
||||
* @param version the version string
|
||||
* @return a VCFHeaderVersion object
|
||||
*/
|
||||
public static VCFHeaderVersion toHeaderVersion(String version) {
|
||||
version = clean(version);
|
||||
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||
if (hv.versionString.equals(version))
|
||||
return hv;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* are we a valid version string of some type
|
||||
* @param version the version string
|
||||
* @return true if we're valid of some type, false otherwise
|
||||
*/
|
||||
public static boolean isVersionString(String version){
|
||||
return toHeaderVersion(version) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* are we a valid format string for some type
|
||||
* @param format the format string
|
||||
* @return true if we're valid of some type, false otherwise
|
||||
*/
|
||||
public static boolean isFormatString(String format){
|
||||
format = clean(format);
|
||||
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||
if (hv.formatString.equals(format))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public static VCFHeaderVersion getHeaderVersion(String versionLine) {
|
||||
String[] lineFields = versionLine.split("=");
|
||||
if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) )
|
||||
throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line");
|
||||
|
||||
if ( !isVersionString(lineFields[1]) )
|
||||
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
|
||||
|
||||
return toHeaderVersion(lineFields[1]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility function to clean up a VCF header string
|
||||
*
|
||||
* @param s string
|
||||
* @return trimmed version of s
|
||||
*/
|
||||
private static String clean(String s) {
|
||||
return s.trim();
|
||||
}
|
||||
|
||||
|
||||
public String getVersionString() {
|
||||
return versionString;
|
||||
}
|
||||
|
||||
public String getFormatString() {
|
||||
return formatString;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
/** an interface for ID-based header lines **/
|
||||
public interface VCFIDHeaderLine {
|
||||
String getID();
|
||||
}
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFInfoHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry for INFO fields in the VCF header
|
||||
*/
|
||||
public class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
|
||||
public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.INFO);
|
||||
}
|
||||
|
||||
public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.INFO);
|
||||
}
|
||||
|
||||
public VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super(line, version, SupportedHeaderLineType.INFO);
|
||||
}
|
||||
|
||||
// info fields allow flag values
|
||||
@Override
|
||||
boolean allowFlagValues() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,106 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* A class representing a key=value entry for simple VCF header types
|
||||
*/
|
||||
public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||
|
||||
private String name;
|
||||
private Map<String, String> genericFields = new LinkedHashMap<String, String>();
|
||||
|
||||
/**
|
||||
* create a VCF filter header line
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param name the name for this header line
|
||||
* @param description description for this header line
|
||||
*/
|
||||
public VCFSimpleHeaderLine(String key, String name, String description) {
|
||||
super(key, "");
|
||||
Map<String, String> map = new LinkedHashMap<String, String>(1);
|
||||
map.put("Description", description);
|
||||
initialize(name, map);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF info header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the vcf header version
|
||||
* @param key the key for this header line
|
||||
* @param expectedTagOrdering the tag ordering expected for this header line
|
||||
*/
|
||||
public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List<String> expectedTagOrdering) {
|
||||
this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering), expectedTagOrdering);
|
||||
}
|
||||
|
||||
public VCFSimpleHeaderLine(final String key, final Map<String, String> mapping, final List<String> expectedTagOrdering) {
|
||||
super(key, "");
|
||||
name = mapping.get("ID");
|
||||
initialize(name, mapping);
|
||||
}
|
||||
|
||||
protected void initialize(String name, Map<String, String> genericFields) {
|
||||
if ( name == null || genericFields == null || genericFields.isEmpty() )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name));
|
||||
|
||||
this.name = name;
|
||||
this.genericFields.putAll(genericFields);
|
||||
}
|
||||
|
||||
protected String toStringEncoding() {
|
||||
Map<String, Object> map = new LinkedHashMap<String, Object>();
|
||||
map.put("ID", name);
|
||||
map.putAll(genericFields);
|
||||
return getKey() + "=" + VCFHeaderLine.toStringEncoding(map);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFSimpleHeaderLine) )
|
||||
return false;
|
||||
VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o;
|
||||
if ( !name.equals(other.name) || genericFields.size() != other.genericFields.size() )
|
||||
return false;
|
||||
for ( Map.Entry<String, String> entry : genericFields.entrySet() ) {
|
||||
if ( !entry.getValue().equals(other.genericFields.get(entry.getKey())) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getID() {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,264 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Manages header lines for standard VCF INFO and FORMAT fields
 *
 * Provides simple mechanisms for registering standard lines,
 * looking them up, and adding them to headers
 *
 * @author Mark DePristo
 * @since 6/12
 */
public class VCFStandardHeaderLines {
    /**
     * Enabling this causes us to repair header lines even if only their descriptions differ
     */
    private final static boolean REPAIR_BAD_DESCRIPTIONS = false;

    // Registries mapping field ID -> canonical line, populated once in the static block below
    private static Standards<VCFFormatHeaderLine> formatStandards = new Standards<VCFFormatHeaderLine>();
    private static Standards<VCFInfoHeaderLine> infoStandards = new Standards<VCFInfoHeaderLine>();

    /**
     * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly
     * allocated VCFHeader with standard VCF header lines repaired as necessary
     *
     * @param header the header whose INFO/FORMAT lines should be repaired; must not be null
     * @return a new VCFHeader with the same samples and the repaired metadata lines
     */
    @Requires("header != null")
    @Ensures("result != null")
    public static VCFHeader repairStandardHeaderLines(final VCFHeader header) {
        final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder().size());
        for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
            // Only compound (INFO/FORMAT) lines are candidates for repair; others pass through unchanged
            if ( line instanceof VCFFormatHeaderLine ) {
                line = formatStandards.repair((VCFFormatHeaderLine) line);
            } else if ( line instanceof VCFInfoHeaderLine) {
                line = infoStandards.repair((VCFInfoHeaderLine) line);
            }

            newLines.add(line);
        }

        return new VCFHeader(newLines, header.getGenotypeSamples());
    }

    /**
     * Adds header lines for each of the format fields in IDs to header, returning the set of
     * IDs without standard descriptions, unless throwErrorForMissing is true, in which
     * case this situation results in a TribbleException
     *
     * @param headerLines destination set the standard lines are added to
     * @param throwErrorForMissing if true, a missing ID throws instead of being returned
     * @param IDs the FORMAT field IDs to look up
     * @return the subset of IDs with no registered standard line
     */
    public static Set<String> addStandardFormatLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final Collection<String> IDs) {
        return formatStandards.addToHeader(headerLines, IDs, throwErrorForMissing);
    }

    /**
     * Varargs convenience overload.
     *
     * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection)
     *
     * @param headerLines destination set the standard lines are added to
     * @param throwErrorForMissing if true, a missing ID throws instead of being returned
     * @param IDs the FORMAT field IDs to look up
     * @return the subset of IDs with no registered standard line
     */
    public static Set<String> addStandardFormatLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final String ... IDs) {
        return addStandardFormatLines(headerLines, throwErrorForMissing, Arrays.asList(IDs));
    }

    /**
     * Returns the standard format line for ID. If none exists, return null or throw an exception, depending
     * on throwErrorForMissing
     *
     * @param ID the FORMAT field ID
     * @param throwErrorForMissing whether an unknown ID throws or yields null
     * @return the standard line, or null when unknown and throwErrorForMissing is false
     */
    public static VCFFormatHeaderLine getFormatLine(final String ID, final boolean throwErrorForMissing) {
        return formatStandards.get(ID, throwErrorForMissing);
    }

    /**
     * Returns the standard format line for ID. If none exists throw an exception
     *
     * @param ID the FORMAT field ID
     * @return the standard line for ID
     */
    public static VCFFormatHeaderLine getFormatLine(final String ID) {
        return formatStandards.get(ID, true);
    }

    // Registers a standard FORMAT line; throws if the ID is already registered
    private static void registerStandard(final VCFFormatHeaderLine line) {
        formatStandards.add(line);
    }

    /**
     * Adds header lines for each of the info fields in IDs to header, returning the set of
     * IDs without standard descriptions, unless throwErrorForMissing is true, in which
     * case this situation results in a TribbleException
     *
     * @param headerLines destination set the standard lines are added to
     * @param throwErrorForMissing if true, a missing ID throws instead of being returned
     * @param IDs the INFO field IDs to look up
     * @return the subset of IDs with no registered standard line
     */
    public static Set<String> addStandardInfoLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final Collection<String> IDs) {
        return infoStandards.addToHeader(headerLines, IDs, throwErrorForMissing);
    }

    /**
     * Varargs convenience overload.
     *
     * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection)
     *
     * @param IDs the INFO field IDs to look up
     * @return the subset of IDs with no registered standard line
     */
    public static Set<String> addStandardInfoLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final String ... IDs) {
        return addStandardInfoLines(headerLines, throwErrorForMissing, Arrays.asList(IDs));
    }

    /**
     * Returns the standard info line for ID. If none exists, return null or throw an exception, depending
     * on throwErrorForMissing
     *
     * @param ID the INFO field ID
     * @param throwErrorForMissing whether an unknown ID throws or yields null
     * @return the standard line, or null when unknown and throwErrorForMissing is false
     */
    public static VCFInfoHeaderLine getInfoLine(final String ID, final boolean throwErrorForMissing) {
        return infoStandards.get(ID, throwErrorForMissing);
    }

    /**
     * Returns the standard info line for ID. If none exists throw an exception
     *
     * @param ID the INFO field ID
     * @return the standard line for ID
     */
    public static VCFInfoHeaderLine getInfoLine(final String ID) {
        return getInfoLine(ID, true);
    }

    // Registers a standard INFO line; throws if the ID is already registered
    private static void registerStandard(final VCFInfoHeaderLine line) {
        infoStandards.add(line);
    }


    //
    // VCF header line constants
    //
    static {
        // FORMAT lines
        registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality"));
        registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"));
        registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
        registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed"));
        registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter"));

        // INFO lines
        registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality"));
        registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event"));
    }

    /** Registry of standard header lines of one compound line type, keyed by field ID. */
    private static class Standards<T extends VCFCompoundHeaderLine> {
        private final Map<String, T> standards = new HashMap<String, T>();

        /**
         * Returns the registered standard line for line's ID when the incoming line's
         * count type, count, type, or (optionally) description disagrees with it;
         * otherwise returns the incoming line unchanged.
         */
        @Requires("line != null")
        @Ensures({"result != null", "result.getID().equals(line.getID())"})
        public T repair(final T line) {
            final T standard = get(line.getID(), false);
            if ( standard != null ) {
                final boolean badCountType = line.getCountType() != standard.getCountType();
                // Only compare concrete counts when the count types already agree
                final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount();
                final boolean badType = line.getType() != standard.getType();
                final boolean badDesc = ! line.getDescription().equals(standard.getDescription());
                final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc);

                if ( needsRepair ) {
                    if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                        System.err.println("Repairing standard header line for field " + line.getID() + " because"
                                + (badCountType ? " -- count types disagree; header has " + line.getCountType() + " but standard is " + standard.getCountType() : "")
                                + (badType ? " -- type disagree; header has " + line.getType() + " but standard is " + standard.getType() : "")
                                + (badCount ? " -- counts disagree; header has " + line.getCount() + " but standard is " + standard.getCount() : "")
                                + (badDesc ? " -- descriptions disagree; header has '" + line.getDescription() + "' but standard is '" + standard.getDescription() + "'": ""));
                    }
                    return standard;
                } else
                    return line;
            } else
                return line;
        }

        /**
         * Adds the standard line for each ID to headerLines; IDs with no standard
         * line are collected and returned (or throw, per throwErrorForMissing).
         */
        @Requires("headerLines != null")
        @Ensures({"result != null", "result.isEmpty() || ! throwErrorForMissing", "IDs.containsAll(result)"})
        public Set<String> addToHeader(final Set<VCFHeaderLine> headerLines, final Collection<String> IDs, final boolean throwErrorForMissing) {
            final Set<String> missing = new HashSet<String>();
            for ( final String ID : IDs ) {
                final T line = get(ID, throwErrorForMissing);
                if ( line == null )
                    missing.add(ID);
                else
                    headerLines.add(line);
            }

            return missing;
        }

        // Registers a line; duplicate IDs are a programming error and throw
        @Requires("line != null")
        @Ensures({"standards.containsKey(line.getID())"})
        public void add(final T line) {
            if ( standards.containsKey(line.getID()) )
                throw new TribbleException("Attempting to add multiple standard header lines for ID " + line.getID());
            standards.put(line.getID(), line);
        }

        // Looks up a line by ID; behavior on a miss depends on throwErrorForMissing
        @Requires("ID != null")
        @Ensures({"result != null || ! throwErrorForMissing"})
        public T get(final String ID, final boolean throwErrorForMissing) {
            final T x = standards.get(ID);
            if ( throwErrorForMissing && x == null )
                throw new TribbleException("Couldn't find a standard VCF header line for field " + ID);
            return x;
        }
    }
}
|
||||
|
|
@ -1,196 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
public class VCFUtils {
|
||||
|
||||
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, boolean emitWarnings) throws IllegalStateException {
|
||||
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
|
||||
HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings);
|
||||
|
||||
// todo -- needs to remove all version headers from sources and add its own VCF version line
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaDataInSortedOrder()) {
|
||||
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFIDHeaderLine )
|
||||
key = key + "-" + ((VCFIDHeaderLine)line).getID();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
if ( line.equals(other) ) {
|
||||
// continue;
|
||||
} else if ( ! line.getClass().equals(other.getClass()) ) {
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFFilterHeaderLine ) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getID();
|
||||
String otherName = ((VCFFilterHeaderLine) other).getID();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||
|
||||
// if the names are the same, but the values are different, we need to quit
|
||||
if (! (compLine).equalsExcludingDescription(compOther) ) {
|
||||
if ( compLine.getType().equals(compOther.getType()) ) {
|
||||
// The Number entry is an Integer that describes the number of values that can be
|
||||
// included with the INFO field. For example, if the INFO field contains a single
|
||||
// number, then this value should be 1. However, if the INFO field describes a pair
|
||||
// of numbers, then this value should be 2 and so on. If the number of possible
|
||||
// values varies, is unknown, or is unbounded, then this value should be '.'.
|
||||
conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
|
||||
compOther.setNumberToUnbounded();
|
||||
} else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) {
|
||||
// promote key to Float
|
||||
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
|
||||
map.put(key, compOther);
|
||||
} else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) {
|
||||
// promote key to Float
|
||||
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
|
||||
} else {
|
||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||
}
|
||||
}
|
||||
if ( ! compLine.getDescription().equals(compOther.getDescription()) )
|
||||
conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
|
||||
} else {
|
||||
// we are not equal, but we're not anything special either
|
||||
conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other);
|
||||
}
|
||||
} else {
|
||||
map.put(key, line);
|
||||
//System.out.printf("Adding header line %s%n", line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new HashSet<VCFHeaderLine>(map.values());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add / replace the contig header lines in the VCFHeader with the in the reference file and master reference dictionary
|
||||
*
|
||||
* @param oldHeader the header to update
|
||||
* @param referenceFile the file path to the reference sequence used to generate this vcf
|
||||
* @param refDict the SAM formatted reference sequence dictionary
|
||||
*/
|
||||
public static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) {
|
||||
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaDataInInputOrder(), referenceFile, refDict), oldHeader.getGenotypeSamples());
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict) {
|
||||
return withUpdatedContigsAsLines(oldLines, referenceFile, refDict, false);
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict, boolean referenceNameOnly) {
|
||||
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>(oldLines.size());
|
||||
|
||||
for ( final VCFHeaderLine line : oldLines ) {
|
||||
if ( line instanceof VCFContigHeaderLine )
|
||||
continue; // skip old contig lines
|
||||
if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) )
|
||||
continue; // skip the old reference key
|
||||
lines.add(line);
|
||||
}
|
||||
|
||||
for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) )
|
||||
lines.add(contigLine);
|
||||
|
||||
String referenceValue;
|
||||
if (referenceFile != null) {
|
||||
if (referenceNameOnly)
|
||||
referenceValue = FilenameUtils.getBaseName(referenceFile.getName());
|
||||
else
|
||||
referenceValue = "file://" + referenceFile.getAbsolutePath();
|
||||
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, referenceValue));
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null
|
||||
* @param refDict reference dictionary
|
||||
* @param referenceFile for assembly name. May be null
|
||||
* @return list of vcf contig header lines
|
||||
*/
|
||||
public static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
|
||||
final File referenceFile) {
|
||||
final List<VCFContigHeaderLine> lines = new ArrayList<VCFContigHeaderLine>();
|
||||
final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null;
|
||||
for ( SAMSequenceRecord contig : refDict.getSequences() )
|
||||
lines.add(makeContigHeaderLine(contig, assembly));
|
||||
return lines;
|
||||
}
|
||||
|
||||
private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
|
||||
final Map<String, String> map = new LinkedHashMap<String, String>(3);
|
||||
map.put("ID", contig.getSequenceName());
|
||||
map.put("length", String.valueOf(contig.getSequenceLength()));
|
||||
if ( assembly != null ) map.put("assembly", assembly);
|
||||
return new VCFContigHeaderLine(map, contig.getSequenceIndex());
|
||||
}
|
||||
|
||||
private static String getReferenceAssembly(final String refPath) {
|
||||
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
|
||||
String assembly = null;
|
||||
if (refPath.contains("b37") || refPath.contains("v37"))
|
||||
assembly = "b37";
|
||||
else if (refPath.contains("b36"))
|
||||
assembly = "b36";
|
||||
else if (refPath.contains("hg18"))
|
||||
assembly = "hg18";
|
||||
else if (refPath.contains("hg19"))
|
||||
assembly = "hg19";
|
||||
return assembly;
|
||||
}
|
||||
|
||||
/** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */
|
||||
private static final class HeaderConflictWarner {
|
||||
boolean emitWarnings;
|
||||
Set<String> alreadyIssued = new HashSet<String>();
|
||||
|
||||
private HeaderConflictWarner( final boolean emitWarnings ) {
|
||||
this.emitWarnings = emitWarnings;
|
||||
}
|
||||
|
||||
public void warn(final VCFHeaderLine line, final String msg) {
|
||||
if ( GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && ! alreadyIssued.contains(line.getKey()) ) {
|
||||
alreadyIssued.add(line.getKey());
|
||||
System.err.println(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -31,9 +31,18 @@ import org.apache.log4j.Logger;
|
|||
import org.apache.log4j.PatternLayout;
|
||||
import org.apache.log4j.spi.LoggingEvent;
|
||||
import org.broadinstitute.sting.commandline.CommandLineUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.crypt.CryptUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.io.IOUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.Reporter;
|
||||
import org.testng.SkipException;
|
||||
|
|
@ -343,4 +352,154 @@ public abstract class BaseTest {
|
|||
+ (message == null ? "" : "message: " + message));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Asserts that two VariantContexts are field-by-field identical: location, ID,
 * alleles, attributes, filter state, phred-scaled qual, and — when the expected
 * context has genotypes — every per-sample genotype.
 */
public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) {
    Assert.assertNotNull(actual, "VariantContext expected not null");
    Assert.assertEquals(actual.getChr(), expected.getChr(), "chr");
    Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
    Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
    Assert.assertEquals(actual.getID(), expected.getID(), "id");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);

    assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
    Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered");
    assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
    // quals are compared with tolerance since they round-trip through text
    assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());

    Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
    if ( expected.hasGenotypes() ) {
        assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
        Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
        final Set<String> samples = expected.getSampleNames();
        for ( final String sample : samples ) {
            assertGenotypesAreEqual(actual.getGenotype(sample), expected.getGenotype(sample));
        }
    }
}
|
||||
|
||||
/**
 * Asserts that two streams of VariantContexts are pairwise equal. Null entries
 * in either stream are skipped; fails if either stream has records left over
 * after the other is exhausted.
 */
public static void assertVariantContextStreamsAreEqual(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
    final Iterator<VariantContext> actualIT = actual.iterator();
    final Iterator<VariantContext> expectedIT = expected.iterator();

    while ( expectedIT.hasNext() ) {
        final VariantContext expectedVC = expectedIT.next();
        if ( expectedVC == null )
            continue;

        // advance actual past any null records to the next real one
        VariantContext actualVC;
        do {
            Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
            actualVC = actualIT.next();
        } while ( actualIT.hasNext() && actualVC == null );

        if ( actualVC == null )
            Assert.fail("Too few records in actual");

        assertVariantContextsAreEqual(actualVC, expectedVC);
    }
    Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
|
||||
|
||||
|
||||
/**
 * Asserts that two Genotypes are identical: sample name, alleles, type,
 * filters, the inline DP/AD/GQ/PL attributes, likelihoods, phred-scaled qual,
 * extended attributes, phasing, and ploidy.
 */
public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) {
    Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles");
    Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string");
    Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type");

    // filters are the same
    Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered");

    // inline attributes
    Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp");
    Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD()));
    Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq");
    Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL");
    Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD");
    Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ");
    Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP");

    Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods");
    Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString");
    Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods");
    Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL()));

    Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual");
    assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
    Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased");
    Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy");
}
|
||||
|
||||
/**
 * Asserts that two VCFHeaders contain the same metadata lines, compared
 * element-by-element in sorted order.
 */
public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) {
    Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");

    // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
    //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
    final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
    final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
    for ( int i = 0; i < actualLines.size(); i++ ) {
        Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
    }
}
|
||||
|
||||
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
|
||||
final Pair<VCFHeader, GATKVCFUtils.VCIterable> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec());
|
||||
final Pair<VCFHeader, GATKVCFUtils.VCIterable> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec());
|
||||
assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst());
|
||||
assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond());
|
||||
}
|
||||
|
||||
private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
|
||||
if ( expected instanceof Double ) {
|
||||
// must be very tolerant because doubles are being rounded to 2 sig figs
|
||||
assertEqualsDoubleSmart(actual, (Double) expected, 1e-2);
|
||||
} else
|
||||
Assert.assertEquals(actual, expected, "Attribute " + key);
|
||||
}
|
||||
|
||||
private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) {
|
||||
final Set<String> expectedKeys = new HashSet<String>(expected.keySet());
|
||||
|
||||
for ( final Map.Entry<String, Object> act : actual.entrySet() ) {
|
||||
final Object actualValue = act.getValue();
|
||||
if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) {
|
||||
final Object expectedValue = expected.get(act.getKey());
|
||||
if ( expectedValue instanceof List ) {
|
||||
final List<Object> expectedList = (List<Object>)expectedValue;
|
||||
Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't");
|
||||
final List<Object> actualList = (List<Object>)actualValue;
|
||||
Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size");
|
||||
for ( int i = 0; i < expectedList.size(); i++ )
|
||||
assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i));
|
||||
} else
|
||||
assertAttributeEquals(act.getKey(), actualValue, expectedValue);
|
||||
} else {
|
||||
// it's ok to have a binding in x -> null that's absent in y
|
||||
Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other");
|
||||
}
|
||||
expectedKeys.remove(act.getKey());
|
||||
}
|
||||
|
||||
// now expectedKeys contains only the keys found in expected but not in actual,
|
||||
// and they must all be null
|
||||
for ( final String missingExpected : expectedKeys ) {
|
||||
final Object value = expected.get(missingExpected);
|
||||
Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" );
|
||||
}
|
||||
}
|
||||
|
||||
private static final boolean isMissing(final Object value) {
|
||||
if ( value == null ) return true;
|
||||
else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true;
|
||||
else if ( value instanceof List ) {
|
||||
// handles the case where all elements are null or the list is empty
|
||||
for ( final Object elt : (List)value)
|
||||
if ( elt != null )
|
||||
return false;
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,7 +49,6 @@ import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
|||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
|
|||
|
|
@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
|||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterSuite;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -82,7 +81,7 @@ public class WalkerTest extends BaseTest {
|
|||
if ( bcfFile != null && bcfFile.exists() ) {
|
||||
logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile);
|
||||
try {
|
||||
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(resultFile, bcfFile);
|
||||
assertVCFandBCFFilesAreTheSame(resultFile, bcfFile);
|
||||
logger.warn(" Shadow BCF PASSED!");
|
||||
} catch ( Exception e ) {
|
||||
Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e);
|
||||
|
|
|
|||
|
|
@ -35,10 +35,12 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
|
@ -250,13 +252,13 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
|
||||
final File file = new File(path);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final VariantContextTestProvider.VariantContextContainer reader = VariantContextTestProvider.readAllVCs(file, codec);
|
||||
final Pair<VCFHeader, GATKVCFUtils.VCIterable> reader = GATKVCFUtils.readAllVCs(file, codec);
|
||||
|
||||
final List<ActiveRegion> incRegions = new ArrayList<ActiveRegion>();
|
||||
final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser);
|
||||
final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser);
|
||||
int pos = start;
|
||||
for ( final VariantContext vc : reader.getVCs() ) {
|
||||
for ( final VariantContext vc : reader.getSecond() ) {
|
||||
if ( vc == null ) continue;
|
||||
while ( pos < vc.getStart() ) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
|
||||
|
|
|
|||
|
|
@ -702,7 +702,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
|
|||
for ( int i = 0; i < biallelics.size(); i++ ) {
|
||||
final VariantContext actual = biallelics.get(i);
|
||||
final VariantContext expected = expectedBiallelics.get(i);
|
||||
VariantContextTestProvider.assertEquals(actual, expected);
|
||||
assertVariantContextsAreEqual(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,166 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant;
|
||||
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Base class for test classes within org.broadinstitute.variant
|
||||
*/
|
||||
public class VariantBaseTest {
|
||||
|
||||
public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
|
||||
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
|
||||
|
||||
// TODO: change this to an appropriate value once the move to the Picard repo takes place
|
||||
public static final String variantTestDataRoot = new File("private/testdata/").getAbsolutePath() + "/";
|
||||
|
||||
/**
|
||||
* Simple generic utility class to creating TestNG data providers:
|
||||
*
|
||||
* 1: inherit this class, as in
|
||||
*
|
||||
* private class SummarizeDifferenceTest extends TestDataProvider {
|
||||
* public SummarizeDifferenceTest() {
|
||||
* super(SummarizeDifferenceTest.class);
|
||||
* }
|
||||
* ...
|
||||
* }
|
||||
*
|
||||
* Provide a reference to your class to the TestDataProvider constructor.
|
||||
*
|
||||
* 2: Create instances of your subclass. Return from it the call to getTests, providing
|
||||
* the class type of your test
|
||||
*
|
||||
* @DataProvider(name = "summaries"
|
||||
* public Object[][] createSummaries() {
|
||||
* new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2");
|
||||
* new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1");
|
||||
* return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class);
|
||||
* }
|
||||
*
|
||||
* This class magically tracks created objects of this
|
||||
*/
|
||||
public static class TestDataProvider {
|
||||
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
|
||||
protected String name;
|
||||
|
||||
/**
|
||||
* Create a new TestDataProvider instance bound to the class variable C
|
||||
* @param c
|
||||
*/
|
||||
public TestDataProvider(Class c, String name) {
|
||||
if ( ! tests.containsKey(c) )
|
||||
tests.put(c, new ArrayList<Object>());
|
||||
tests.get(c).add(this);
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public TestDataProvider(Class c) {
|
||||
this(c, "");
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all of the data providers in the form expected by TestNG of type class C
|
||||
* @param c
|
||||
* @return
|
||||
*/
|
||||
public static Object[][] getTests(Class c) {
|
||||
List<Object[]> params2 = new ArrayList<Object[]>();
|
||||
for ( Object x : tests.get(c) ) params2.add(new Object[]{x});
|
||||
return params2.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "TestDataProvider("+name+")";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a temp file that will be deleted on exit after tests are complete.
|
||||
* @param name Prefix of the file.
|
||||
* @param extension Extension to concat to the end of the file.
|
||||
* @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits.
|
||||
*/
|
||||
public static File createTempFile(String name, String extension) {
|
||||
try {
|
||||
File file = File.createTempFile(name, extension);
|
||||
file.deleteOnExit();
|
||||
return file;
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("Cannot create temp file: " + ex.getMessage(), ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1;
|
||||
|
||||
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) {
|
||||
Assert.assertTrue(actual instanceof Double, "Not a double");
|
||||
assertEqualsDoubleSmart((double)(Double)actual, (double)expected);
|
||||
}
|
||||
|
||||
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) {
|
||||
Assert.assertTrue(actual instanceof Double, "Not a double");
|
||||
assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance);
|
||||
}
|
||||
|
||||
public static final void assertEqualsDoubleSmart(final double actual, final double expected) {
|
||||
assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE);
|
||||
}
|
||||
|
||||
public static final <T> void assertEqualsSet(final Set<T> actual, final Set<T> expected, final String info) {
|
||||
final Set<T> actualSet = new HashSet<T>(actual);
|
||||
final Set<T> expectedSet = new HashSet<T>(expected);
|
||||
Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps
|
||||
}
|
||||
|
||||
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) {
|
||||
assertEqualsDoubleSmart(actual, expected, tolerance, null);
|
||||
}
|
||||
|
||||
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) {
|
||||
if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately
|
||||
Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not");
|
||||
else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately
|
||||
Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not");
|
||||
else {
|
||||
final double delta = Math.abs(actual - expected);
|
||||
final double ratio = Math.abs(actual / expected - 1.0);
|
||||
Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual
|
||||
+ " not within tolerance " + tolerance
|
||||
+ (message == null ? "" : "message: " + message));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,573 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
// the imports for unit testing.
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.variantcontext.writer.BCF2Encoder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class BCF2EncoderDecoderUnitTest extends VariantBaseTest {
|
||||
private final double FLOAT_TOLERANCE = 1e-6;
|
||||
final List<BCF2TypedValue> primitives = new ArrayList<BCF2TypedValue>();
|
||||
final List<BCF2TypedValue> basicTypes = new ArrayList<BCF2TypedValue>();
|
||||
final List<BCF2TypedValue> forCombinations = new ArrayList<BCF2TypedValue>();
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
basicTypes.add(new BCF2TypedValue(1, BCF2Type.INT8));
|
||||
basicTypes.add(new BCF2TypedValue(1000, BCF2Type.INT16));
|
||||
basicTypes.add(new BCF2TypedValue(1000000, BCF2Type.INT32));
|
||||
basicTypes.add(new BCF2TypedValue(1.2345e6, BCF2Type.FLOAT));
|
||||
basicTypes.add(new BCF2TypedValue("A", BCF2Type.CHAR));
|
||||
|
||||
// small ints
|
||||
primitives.add(new BCF2TypedValue(0, BCF2Type.INT8));
|
||||
primitives.add(new BCF2TypedValue(10, BCF2Type.INT8));
|
||||
primitives.add(new BCF2TypedValue(-1, BCF2Type.INT8));
|
||||
primitives.add(new BCF2TypedValue(100, BCF2Type.INT8));
|
||||
primitives.add(new BCF2TypedValue(-100, BCF2Type.INT8));
|
||||
primitives.add(new BCF2TypedValue(-127, BCF2Type.INT8)); // last value in range
|
||||
primitives.add(new BCF2TypedValue( 127, BCF2Type.INT8)); // last value in range
|
||||
|
||||
// medium ints
|
||||
primitives.add(new BCF2TypedValue(-1000, BCF2Type.INT16));
|
||||
primitives.add(new BCF2TypedValue(1000, BCF2Type.INT16));
|
||||
primitives.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range
|
||||
primitives.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range
|
||||
primitives.add(new BCF2TypedValue(-32767, BCF2Type.INT16)); // last value in range
|
||||
primitives.add(new BCF2TypedValue( 32767, BCF2Type.INT16)); // last value in range
|
||||
|
||||
// larger ints
|
||||
primitives.add(new BCF2TypedValue(-32768, BCF2Type.INT32)); // first value in range
|
||||
primitives.add(new BCF2TypedValue( 32768, BCF2Type.INT32)); // first value in range
|
||||
primitives.add(new BCF2TypedValue(-100000, BCF2Type.INT32));
|
||||
primitives.add(new BCF2TypedValue(100000, BCF2Type.INT32));
|
||||
primitives.add(new BCF2TypedValue(-2147483647, BCF2Type.INT32));
|
||||
primitives.add(new BCF2TypedValue(2147483647, BCF2Type.INT32));
|
||||
|
||||
// floats
|
||||
primitives.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-0.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.1, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.1, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(5.0 / 3.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-5.0 / 3.0, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.23e3, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.23e6, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.23e9, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.23e12, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(1.23e15, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.23e3, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.23e6, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.23e9, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.23e12, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(-1.23e15, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(Float.MIN_VALUE, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(Float.MAX_VALUE, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(Double.NEGATIVE_INFINITY, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(Double.POSITIVE_INFINITY, BCF2Type.FLOAT));
|
||||
primitives.add(new BCF2TypedValue(Double.NaN, BCF2Type.FLOAT));
|
||||
|
||||
// strings
|
||||
//primitives.add(new BCF2TypedValue("", BCFType.CHAR)); <- will be null (which is right)
|
||||
primitives.add(new BCF2TypedValue("S", BCF2Type.CHAR));
|
||||
primitives.add(new BCF2TypedValue("S2", BCF2Type.CHAR));
|
||||
primitives.add(new BCF2TypedValue("12345678910", BCF2Type.CHAR));
|
||||
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR));
|
||||
primitives.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR));
|
||||
|
||||
// missing values
|
||||
for ( BCF2Type type : BCF2Type.values() ) {
|
||||
primitives.add(new BCF2TypedValue(null, type));
|
||||
}
|
||||
|
||||
forCombinations.add(new BCF2TypedValue(10, BCF2Type.INT8));
|
||||
forCombinations.add(new BCF2TypedValue(100, BCF2Type.INT8));
|
||||
forCombinations.add(new BCF2TypedValue(-100, BCF2Type.INT8));
|
||||
forCombinations.add(new BCF2TypedValue(-128, BCF2Type.INT16)); // first value in range
|
||||
forCombinations.add(new BCF2TypedValue( 128, BCF2Type.INT16)); // first value in range
|
||||
forCombinations.add(new BCF2TypedValue(-100000, BCF2Type.INT32));
|
||||
forCombinations.add(new BCF2TypedValue(100000, BCF2Type.INT32));
|
||||
forCombinations.add(new BCF2TypedValue(0.0, BCF2Type.FLOAT));
|
||||
forCombinations.add(new BCF2TypedValue(1.23e6, BCF2Type.FLOAT));
|
||||
forCombinations.add(new BCF2TypedValue(-1.23e6, BCF2Type.FLOAT));
|
||||
forCombinations.add(new BCF2TypedValue("S", BCF2Type.CHAR));
|
||||
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR));
|
||||
forCombinations.add(new BCF2TypedValue("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ", BCF2Type.CHAR));
|
||||
|
||||
// missing values
|
||||
for ( BCF2Type type : BCF2Type.values() ) {
|
||||
forCombinations.add(new BCF2TypedValue(null, type));
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// merge case Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class BCF2TypedValue {
|
||||
final BCF2Type type;
|
||||
final Object value;
|
||||
|
||||
private BCF2TypedValue(final int value, final BCF2Type type) {
|
||||
this(new Integer(value), type);
|
||||
}
|
||||
|
||||
private BCF2TypedValue(final double value, final BCF2Type type) {
|
||||
this(new Double(value), type);
|
||||
}
|
||||
|
||||
private BCF2TypedValue(final Object value, final BCF2Type type) {
|
||||
this.type = type;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public boolean isMissing() { return value == null; }
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("%s of %s", value, type);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Test encoding of basic types
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "BCF2EncodingTestProviderBasicTypes")
|
||||
public Object[][] BCF2EncodingTestProviderBasicTypes() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( BCF2TypedValue tv : basicTypes )
|
||||
tests.add(new Object[]{Arrays.asList(tv)});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private interface EncodeMe {
|
||||
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException;
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||
public void testBCF2BasicTypesWithStaticCalls(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
testBCF2BasicTypesWithEncodeMe(toEncode,
|
||||
new EncodeMe() {
|
||||
@Override
|
||||
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
|
||||
switch ( tv.type ) {
|
||||
case INT8:
|
||||
case INT16:
|
||||
case INT32:
|
||||
encoder.encodeTypedInt((Integer)tv.value, tv.type);
|
||||
break;
|
||||
case FLOAT:
|
||||
encoder.encodeTypedFloat((Double)tv.value);
|
||||
break;
|
||||
case CHAR:
|
||||
encoder.encodeTypedString((String)tv.value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||
public void testBCF2BasicTypesWithObjectType(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
testBCF2BasicTypesWithEncodeMe(toEncode,
|
||||
new EncodeMe() {
|
||||
@Override
|
||||
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
|
||||
encoder.encodeTyped(tv.value, tv.type);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||
public void testBCF2BasicTypesWithObjectNoType(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
testBCF2BasicTypesWithEncodeMe(toEncode,
|
||||
new EncodeMe() {
|
||||
@Override
|
||||
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
|
||||
encoder.encode(tv.value);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public void testBCF2BasicTypesWithEncodeMe(final List<BCF2TypedValue> toEncode, final EncodeMe func) throws IOException {
|
||||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
BCF2Encoder encoder = new BCF2Encoder();
|
||||
func.encode(encoder, tv);
|
||||
|
||||
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||
final Object decoded = decoder.decodeTypedValue();
|
||||
|
||||
Assert.assertNotNull(decoded);
|
||||
Assert.assertFalse(decoded instanceof List);
|
||||
myAssertEquals(tv, decoded);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||
public void testBCF2EncodingVectors(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
|
||||
BCF2Encoder encoder = new BCF2Encoder();
|
||||
List<Object> expected = Collections.nCopies(length, tv.value);
|
||||
encoder.encodeTyped(expected, tv.type);
|
||||
|
||||
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||
final Object decoded = decoder.decodeTypedValue();
|
||||
|
||||
Assert.assertTrue(decoded instanceof List);
|
||||
final List<Object> decodedList = (List<Object>)decoded;
|
||||
Assert.assertEquals(decodedList.size(), expected.size());
|
||||
for ( Object decodedValue : decodedList )
|
||||
myAssertEquals(tv, decodedValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "BCF2EncodingTestProviderSingletons")
|
||||
public Object[][] BCF2EncodingTestProviderSingletons() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( BCF2TypedValue tv : primitives )
|
||||
tests.add(new Object[]{Arrays.asList(tv)});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderSingletons")
|
||||
public void testBCF2EncodingSingletons(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
final byte[] record = encodeRecord(toEncode);
|
||||
decodeRecord(toEncode, record);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Test encoding of vectors
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "BCF2EncodingTestProviderSequences")
|
||||
public Object[][] BCF2EncodingTestProviderSequences() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( BCF2TypedValue tv1 : forCombinations )
|
||||
for ( BCF2TypedValue tv2 : forCombinations )
|
||||
for ( BCF2TypedValue tv3 : forCombinations )
|
||||
tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
|
||||
public void testBCF2EncodingVectorsWithMissing(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
if ( tv.type != BCF2Type.CHAR ) {
|
||||
for ( final int length : Arrays.asList(2, 5, 10, 15, 20, 25) ) {
|
||||
final byte td = BCF2Utils.encodeTypeDescriptor(1, tv.type);
|
||||
|
||||
final BCF2Encoder encoder = new BCF2Encoder();
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
encoder.encodeRawValue(i % 2 == 0 ? null : tv.value, tv.type);
|
||||
}
|
||||
|
||||
final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
final Object decoded = decoder.decodeTypedValue(td);
|
||||
myAssertEquals(i % 2 == 0 ? new BCF2TypedValue(null, tv.type) : tv, decoded);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons")
|
||||
public void testBCF2EncodingTestProviderSequences(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
final byte[] record = encodeRecord(toEncode);
|
||||
decodeRecord(toEncode, record);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Test strings and lists of strings
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "ListOfStrings")
|
||||
public Object[][] listOfStringsProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"});
|
||||
tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"});
|
||||
tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ListOfStrings")
|
||||
public void testEncodingListOfString(List<String> strings, String expected) throws IOException {
|
||||
final String collapsed = BCF2Utils.collapseStringList(strings);
|
||||
Assert.assertEquals(collapsed, expected);
|
||||
Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Tests to determine the best type of arrays of integers
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "BestIntTypeTests")
|
||||
public Object[][] BestIntTypeTests() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{Arrays.asList(1), BCF2Type.INT8});
|
||||
tests.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8});
|
||||
tests.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8});
|
||||
tests.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8});
|
||||
tests.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16});
|
||||
tests.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16});
|
||||
tests.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16});
|
||||
tests.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16});
|
||||
tests.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32});
|
||||
tests.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32});
|
||||
tests.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32});
|
||||
tests.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32});
|
||||
tests.add(new Object[]{Arrays.asList(-100000, 1, -10), BCF2Type.INT32});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BestIntTypeTests")
|
||||
public void determineBestEncoding(final List<Integer> ints, final BCF2Type expectedType) throws IOException {
|
||||
BCF2Encoder encoder = new BCF2Encoder();
|
||||
Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType);
|
||||
Assert.assertEquals(BCF2Utils.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Tests managing and skipping multiple blocks
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences")
|
||||
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block) throws IOException {
|
||||
testReadAndSkipWithMultipleBlocks(block, forCombinations);
|
||||
testReadAndSkipWithMultipleBlocks(forCombinations, block);
|
||||
}
|
||||
|
||||
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block1, final List<BCF2TypedValue> block2) throws IOException {
|
||||
final byte[] record1 = encodeRecord(block1);
|
||||
final byte[] record2 = encodeRecord(block2);
|
||||
|
||||
// each record is individually good
|
||||
decodeRecord(block1, record1);
|
||||
decodeRecord(block2, record2);
|
||||
|
||||
BCF2Decoder decoder = new BCF2Decoder();
|
||||
|
||||
// test setting
|
||||
decoder.setRecordBytes(record1);
|
||||
decodeRecord(block1, decoder);
|
||||
decoder.setRecordBytes(record2);
|
||||
decodeRecord(block2, decoder);
|
||||
|
||||
// test combining the streams
|
||||
final byte[] combined = combineRecords(record1, record2);
|
||||
final List<BCF2TypedValue> combinedObjects = new ArrayList<BCF2TypedValue>(block1);
|
||||
combinedObjects.addAll(block2);
|
||||
|
||||
// the combined bytes is the same as the combined objects
|
||||
InputStream stream = new ByteArrayInputStream(combined);
|
||||
decoder.readNextBlock(record1.length, stream);
|
||||
decodeRecord(block1, decoder);
|
||||
decoder.readNextBlock(record2.length, stream);
|
||||
decodeRecord(block2, decoder);
|
||||
|
||||
// skipping the first block allows us to read the second block directly
|
||||
stream = new ByteArrayInputStream(combined);
|
||||
decoder.skipNextBlock(record1.length, stream);
|
||||
decoder.readNextBlock(record2.length, stream);
|
||||
decodeRecord(block2, decoder);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Test encoding / decoding arrays of ints
|
||||
//
|
||||
// This checks that we can encode and decode correctly with the
|
||||
// low-level decodeIntArray function arrays of values. This
|
||||
// has to be pretty comprehensive as decodeIntArray is a highly optimized
|
||||
// piece of code with lots of edge cases. The values we are encoding
|
||||
// don't really matter -- just that the values come back as expected.
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "IntArrays")
|
||||
public Object[][] makeIntArrays() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( int nValues : Arrays.asList(0, 1, 2, 5, 10, 100) ) {
|
||||
for ( int nPad : Arrays.asList(0, 1, 2, 5, 10, 100) ) {
|
||||
int nElements = nValues + nPad;
|
||||
|
||||
List<Integer> values = new ArrayList<Integer>(nElements);
|
||||
|
||||
// add nValues from 0 to nValues - 1
|
||||
for ( int i = 0; i < nValues; i++ )
|
||||
values.add(i);
|
||||
|
||||
// add nPad nulls
|
||||
for ( int i = 0; i < nPad; i++ )
|
||||
values.add(null);
|
||||
|
||||
tests.add(new Object[]{values});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "IntArrays")
|
||||
public void testIntArrays(final List<Integer> ints) throws IOException {
|
||||
final BCF2Encoder encoder = new BCF2Encoder();
|
||||
encoder.encodeTyped(ints, BCF2Type.INT16);
|
||||
|
||||
final BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
|
||||
|
||||
final byte typeDescriptor = decoder.readTypeDescriptor();
|
||||
|
||||
// read the int[] with the low-level version
|
||||
final int size = decoder.decodeNumberOfElements(typeDescriptor);
|
||||
final int[] decoded = decoder.decodeIntArray(typeDescriptor, size);
|
||||
|
||||
if ( isMissing(ints) ) {
|
||||
// we expect that the result is null in this case
|
||||
Assert.assertNull(decoded, "Encoded all missing values -- expected null");
|
||||
} else {
|
||||
// we expect at least some values to come back
|
||||
Assert.assertTrue(decoded.length > 0, "Must have at least 1 element for non-null encoded data");
|
||||
|
||||
// check corresponding values
|
||||
for ( int i = 0; i < ints.size(); i++ ) {
|
||||
final Integer expected = ints.get(i);
|
||||
|
||||
if ( expected == null ) {
|
||||
Assert.assertTrue(decoded.length <= i, "we expect decoded to be truncated for missing values");
|
||||
} else {
|
||||
Assert.assertTrue(decoded.length > i, "we expected at least " + i + " values in decoded array");
|
||||
Assert.assertEquals(decoded[i], (int)expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// Helper routines
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
baos.write(record1);
|
||||
baos.write(record2);
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
private final byte[] encodeRecord(final List<BCF2TypedValue> toEncode) throws IOException {
|
||||
BCF2Encoder encoder = new BCF2Encoder();
|
||||
|
||||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
if ( tv.isMissing() )
|
||||
encoder.encodeTypedMissing(tv.type);
|
||||
else {
|
||||
final BCF2Type encodedType = encoder.encode(tv.value);
|
||||
if ( tv.type != null ) // only if we have an expectation
|
||||
Assert.assertEquals(encodedType, tv.type);
|
||||
}
|
||||
}
|
||||
|
||||
// check output
|
||||
final byte[] record = encoder.getRecordBytes();
|
||||
Assert.assertNotNull(record);
|
||||
Assert.assertTrue(record.length > 0);
|
||||
return record;
|
||||
}
|
||||
|
||||
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final byte[] record) throws IOException {
|
||||
decodeRecord(toEncode, new BCF2Decoder(record));
|
||||
}
|
||||
|
||||
private final void decodeRecord(final List<BCF2TypedValue> toEncode, final BCF2Decoder decoder) throws IOException {
|
||||
for ( final BCF2TypedValue tv : toEncode ) {
|
||||
Assert.assertFalse(decoder.blockIsFullyDecoded());
|
||||
final Object decoded = decoder.decodeTypedValue();
|
||||
|
||||
myAssertEquals(tv, decoded);
|
||||
}
|
||||
|
||||
Assert.assertTrue(decoder.blockIsFullyDecoded());
|
||||
}
|
||||
|
||||
private final void myAssertEquals(final BCF2TypedValue tv, final Object decoded) {
|
||||
if ( tv.value == null ) { // special needs for instanceof double
|
||||
Assert.assertEquals(decoded, tv.value);
|
||||
} else if ( tv.type == BCF2Type.FLOAT ) { // need tolerance for floats, and they aren't null
|
||||
Assert.assertTrue(decoded instanceof Double);
|
||||
|
||||
final double valueFloat = (Double)tv.value;
|
||||
final double decodedFloat = (Double)decoded;
|
||||
|
||||
VariantBaseTest.assertEqualsDoubleSmart(decodedFloat, valueFloat, FLOAT_TOLERANCE);
|
||||
} else
|
||||
Assert.assertEquals(decoded, tv.value);
|
||||
}
|
||||
|
||||
private final boolean isMissing(final List<Integer> values) {
|
||||
if ( values != null )
|
||||
for ( Integer value : values )
|
||||
if ( value != null )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,153 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.bcf2;
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Tests for BCF2Utils
|
||||
*/
|
||||
public final class BCF2UtilsUnitTest extends VariantBaseTest {
|
||||
@DataProvider(name = "CollapseExpandTest")
|
||||
public Object[][] makeCollapseExpandTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{Arrays.asList("A"), "A", false});
|
||||
tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true});
|
||||
tests.add(new Object[]{Arrays.asList("AB"), "AB", false});
|
||||
tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true});
|
||||
tests.add(new Object[]{Arrays.asList(), "", false});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CollapseExpandTest")
|
||||
public void testCollapseExpandTest(final List<String> in, final String expectedCollapsed, final boolean isCollapsed) {
|
||||
final String actualCollapsed = BCF2Utils.collapseStringList(in);
|
||||
Assert.assertEquals(actualCollapsed, expectedCollapsed);
|
||||
Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed);
|
||||
if ( isCollapsed )
|
||||
Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in);
|
||||
}
|
||||
|
||||
@DataProvider(name = "HeaderOrderTestProvider")
|
||||
public Object[][] makeHeaderOrderTestProvider() {
|
||||
final List<VCFHeaderLine> inputLines = new ArrayList<VCFHeaderLine>();
|
||||
final List<VCFHeaderLine> extraLines = new ArrayList<VCFHeaderLine>();
|
||||
|
||||
int counter = 0;
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
final int inputLineCounter = counter;
|
||||
final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(inputLines));
|
||||
|
||||
extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFHeaderLine("x", "misc"));
|
||||
extraLines.add(new VCFHeaderLine("y", "misc"));
|
||||
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) {
|
||||
final List<VCFHeaderLine> empty = Collections.emptyList();
|
||||
final List<List<VCFHeaderLine>> permutations = extrasToTake == 0
|
||||
? Collections.singletonList(empty)
|
||||
: GeneralUtils.makePermutations(extraLines, extrasToTake, false);
|
||||
for ( final List<VCFHeaderLine> permutation : permutations ) {
|
||||
for ( int i = -1; i < inputLines.size(); i++ ) {
|
||||
final List<VCFHeaderLine> allLines = new ArrayList<VCFHeaderLine>(inputLines);
|
||||
if ( i >= 0 )
|
||||
allLines.remove(i);
|
||||
allLines.addAll(permutation);
|
||||
final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(allLines));
|
||||
final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter);
|
||||
tests.add(new Object[]{inputHeader, testHeader, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sample name tests
|
||||
final List<List<String>> sampleNameTests = Arrays.asList(
|
||||
new ArrayList<String>(),
|
||||
Arrays.asList("A"),
|
||||
Arrays.asList("A", "B"),
|
||||
Arrays.asList("A", "B", "C"));
|
||||
for ( final List<String> inSamples : sampleNameTests ) {
|
||||
for ( final List<String> testSamples : sampleNameTests ) {
|
||||
final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples);
|
||||
|
||||
final List<List<String>> permutations = testSamples.isEmpty()
|
||||
? Collections.singletonList(testSamples)
|
||||
: GeneralUtils.makePermutations(testSamples, testSamples.size(), false);
|
||||
for ( final List<String> testSamplesPermutation : permutations ) {
|
||||
final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation);
|
||||
final boolean expectedConsistent = testSamples.equals(inSamples);
|
||||
tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) {
|
||||
final List<Integer> ids = new ArrayList<Integer>();
|
||||
for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID()));
|
||||
}
|
||||
}
|
||||
|
||||
// as long as the start contains all of the ids up to minCounterForInputLines in order
|
||||
for ( int i = 0; i < minCounterForInputLines; i++ )
|
||||
if ( i >= ids.size() || ids.get(i) != i )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2
|
||||
// even when the header file is slightly different
|
||||
//
|
||||
@Test(dataProvider = "HeaderOrderTestProvider")
|
||||
public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) {
|
||||
final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader);
|
||||
Assert.assertEquals(actualOrderConsistency, expectedConsistent);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,180 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
// public Allele(byte[] bases, boolean isRef) {
|
||||
// public Allele(boolean isRef) {
|
||||
// public Allele(String bases, boolean isRef) {
|
||||
// public boolean isReference() { return isRef; }
|
||||
// public boolean isNonReference() { return ! isReference(); }
|
||||
// public byte[] getBases() { return bases; }
|
||||
// public boolean equals(Allele other) {
|
||||
// public int length() {
|
||||
|
||||
/**
|
||||
* Basic unit test for RecalData
|
||||
*/
|
||||
public class AlleleUnitTest extends VariantBaseTest {
|
||||
Allele ARef, A, T, ATIns, ATCIns, NoCall;
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
A = Allele.create("A");
|
||||
ARef = Allele.create("A", true);
|
||||
T = Allele.create("T");
|
||||
|
||||
ATIns = Allele.create("AT");
|
||||
ATCIns = Allele.create("ATC");
|
||||
|
||||
NoCall = Allele.create(".");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatingSNPAlleles() {
|
||||
Assert.assertTrue(A.isNonReference());
|
||||
Assert.assertFalse(A.isReference());
|
||||
Assert.assertTrue(A.basesMatch("A"));
|
||||
Assert.assertEquals(A.length(), 1);
|
||||
|
||||
Assert.assertTrue(ARef.isReference());
|
||||
Assert.assertFalse(ARef.isNonReference());
|
||||
Assert.assertTrue(ARef.basesMatch("A"));
|
||||
Assert.assertFalse(ARef.basesMatch("T"));
|
||||
|
||||
Assert.assertTrue(T.isNonReference());
|
||||
Assert.assertFalse(T.isReference());
|
||||
Assert.assertTrue(T.basesMatch("T"));
|
||||
Assert.assertFalse(T.basesMatch("A"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatingNoCallAlleles() {
|
||||
Assert.assertTrue(NoCall.isNonReference());
|
||||
Assert.assertFalse(NoCall.isReference());
|
||||
Assert.assertFalse(NoCall.basesMatch("."));
|
||||
Assert.assertEquals(NoCall.length(), 0);
|
||||
Assert.assertTrue(NoCall.isNoCall());
|
||||
Assert.assertFalse(NoCall.isCalled());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCreatingIndelAlleles() {
|
||||
Assert.assertEquals(ATIns.length(), 2);
|
||||
Assert.assertEquals(ATCIns.length(), 3);
|
||||
Assert.assertEquals(ATIns.getBases(), "AT".getBytes());
|
||||
Assert.assertEquals(ATCIns.getBases(), "ATC".getBytes());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testConstructors1() {
|
||||
Allele a1 = Allele.create("A");
|
||||
Allele a2 = Allele.create("A".getBytes());
|
||||
Allele a3 = Allele.create("A");
|
||||
Allele a4 = Allele.create("A", true);
|
||||
|
||||
Assert.assertTrue(a1.equals(a2));
|
||||
Assert.assertTrue(a1.equals(a3));
|
||||
Assert.assertFalse(a1.equals(a4));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInsConstructors() {
|
||||
Allele a1 = Allele.create("AC");
|
||||
Allele a2 = Allele.create("AC".getBytes());
|
||||
Allele a3 = Allele.create("AC");
|
||||
Allele a4 = Allele.create("AC", true);
|
||||
|
||||
Assert.assertTrue(a1.equals(a2));
|
||||
Assert.assertTrue(a1.equals(a3));
|
||||
Assert.assertFalse(a1.equals(a4));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEquals() {
|
||||
Assert.assertTrue(ARef.basesMatch(A));
|
||||
Assert.assertFalse(ARef.equals(A));
|
||||
Assert.assertFalse(ARef.equals(ATIns));
|
||||
Assert.assertFalse(ARef.equals(ATCIns));
|
||||
|
||||
Assert.assertTrue(T.basesMatch(T));
|
||||
Assert.assertFalse(T.basesMatch(A));
|
||||
Assert.assertFalse(T.equals(A));
|
||||
|
||||
Assert.assertTrue(ATIns.equals(ATIns));
|
||||
Assert.assertFalse(ATIns.equals(ATCIns));
|
||||
Assert.assertTrue(ATIns.basesMatch("AT"));
|
||||
Assert.assertFalse(ATIns.basesMatch("A"));
|
||||
Assert.assertFalse(ATIns.basesMatch("ATC"));
|
||||
|
||||
Assert.assertTrue(ATIns.basesMatch("AT"));
|
||||
Assert.assertFalse(ATIns.basesMatch("ATC"));
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadConstructorArgs1() {
|
||||
byte[] foo = null;
|
||||
Allele.create(foo);
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadConstructorArgs2() {
|
||||
Allele.create("x");
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadConstructorArgs3() {
|
||||
Allele.create("--");
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadConstructorArgs4() {
|
||||
Allele.create("-A");
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = IllegalArgumentException.class)
|
||||
public void testBadConstructorArgs5() {
|
||||
Allele.create("A A");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtend() {
|
||||
Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString());
|
||||
Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString());
|
||||
Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString());
|
||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString());
|
||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,203 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Basic unit test for Genotype likelihoods objects
|
||||
*/
|
||||
public class GenotypeLikelihoodsUnitTest extends VariantBaseTest {
|
||||
double [] v = new double[]{-10.5, -1.25, -5.11};
|
||||
final static String vGLString = "-10.50,-1.25,-5.11";
|
||||
final static String vPLString = "93,0,39";
|
||||
double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC
|
||||
|
||||
@Test
|
||||
public void testFromVector2() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(v);
|
||||
assertDoubleArraysAreEqual(gl.getAsVector(), v);
|
||||
Assert.assertEquals(gl.getAsString(), vPLString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFromString1() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField(vPLString);
|
||||
assertDoubleArraysAreEqual(gl.getAsVector(), new double[]{-9.3, 0, -3.9});
|
||||
Assert.assertEquals(gl.getAsString(), vPLString);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFromString2() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromGLField(vGLString);
|
||||
assertDoubleArraysAreEqual(gl.getAsVector(), v);
|
||||
Assert.assertEquals(gl.getAsString(), vPLString);
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = TribbleException.class)
|
||||
public void testErrorBadFormat() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField("adf,b,c");
|
||||
gl.getAsVector();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetAsMap(){
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(v);
|
||||
//Log scale
|
||||
EnumMap<GenotypeType,Double> glMap = gl.getAsMap(false);
|
||||
Assert.assertEquals(v[GenotypeType.HOM_REF.ordinal()-1],glMap.get(GenotypeType.HOM_REF));
|
||||
Assert.assertEquals(v[GenotypeType.HET.ordinal()-1],glMap.get(GenotypeType.HET));
|
||||
Assert.assertEquals(v[GenotypeType.HOM_VAR.ordinal()-1],glMap.get(GenotypeType.HOM_VAR));
|
||||
|
||||
//Linear scale
|
||||
glMap = gl.getAsMap(true);
|
||||
double [] vl = GeneralUtils.normalizeFromLog10(v);
|
||||
Assert.assertEquals(vl[GenotypeType.HOM_REF.ordinal()-1],glMap.get(GenotypeType.HOM_REF));
|
||||
Assert.assertEquals(vl[GenotypeType.HET.ordinal()-1],glMap.get(GenotypeType.HET));
|
||||
Assert.assertEquals(vl[GenotypeType.HOM_VAR.ordinal()-1],glMap.get(GenotypeType.HOM_VAR));
|
||||
|
||||
//Test missing likelihoods
|
||||
gl = GenotypeLikelihoods.fromPLField(".");
|
||||
glMap = gl.getAsMap(false);
|
||||
Assert.assertNull(glMap);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCalculateNumLikelihoods() {
|
||||
|
||||
for (int nAlleles=2; nAlleles<=5; nAlleles++)
|
||||
// simplest case: diploid
|
||||
Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2);
|
||||
|
||||
// some special cases: ploidy = 20, #alleles = 4
|
||||
Assert.assertEquals(GenotypeLikelihoods.numLikelihoods(4, 20), 1771);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetLog10GQ(){
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromPLField(vPLString);
|
||||
|
||||
//GQ for the best guess genotype
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HET),-3.9);
|
||||
|
||||
double[] test = GeneralUtils.normalizeFromLog10(gl.getAsVector());
|
||||
|
||||
//GQ for the other genotypes
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_REF), Math.log10(1.0 - test[GenotypeType.HOM_REF.ordinal()-1]));
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_VAR), Math.log10(1.0 - test[GenotypeType.HOM_VAR.ordinal()-1]));
|
||||
|
||||
//Test missing likelihoods
|
||||
gl = GenotypeLikelihoods.fromPLField(".");
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_REF),Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HET),Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(gl.getLog10GQ(GenotypeType.HOM_VAR),Double.NEGATIVE_INFINITY);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testgetQualFromLikelihoods() {
|
||||
double[] likelihoods = new double[]{-1, 0, -2};
|
||||
// qual values we expect for each possible "best" genotype
|
||||
double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294};
|
||||
|
||||
for ( int i = 0; i < likelihoods.length; i++ ) {
|
||||
Assert.assertEquals(GenotypeLikelihoods.getGQLog10FromLikelihoods(i, likelihoods), expectedQuals[i], 1e-6,
|
||||
"GQ value for genotype " + i + " was not calculated correctly");
|
||||
}
|
||||
}
|
||||
|
||||
// this test is completely broken, the method is wrong.
|
||||
public void testGetQualFromLikelihoodsMultiAllelicBroken() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
|
||||
double actualGQ = gl.getLog10GQ(GenotypeType.HET);
|
||||
double expectedGQ = 1.6;
|
||||
Assert.assertEquals(actualGQ,expectedGQ);
|
||||
}
|
||||
|
||||
public void testGetQualFromLikelihoodsMultiAllelic() {
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
|
||||
Allele ref = Allele.create((byte)'A',true);
|
||||
Allele alt1 = Allele.create((byte)'C');
|
||||
Allele alt2 = Allele.create((byte)'T');
|
||||
List<Allele> allAlleles = Arrays.asList(ref,alt1,alt2);
|
||||
List<Allele> gtAlleles = Arrays.asList(alt1,alt2);
|
||||
GenotypeBuilder gtBuilder = new GenotypeBuilder();
|
||||
gtBuilder.alleles(gtAlleles);
|
||||
double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles);
|
||||
double expectedGQ = 1.6;
|
||||
Assert.assertEquals(actualGQ,expectedGQ);
|
||||
}
|
||||
|
||||
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
|
||||
Assert.assertEquals(v1.length, v2.length);
|
||||
for ( int i = 0; i < v1.length; i++ ) {
|
||||
Assert.assertEquals(v1[i], v2[i], 1e-6);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCalculatePLindex(){
|
||||
int counter = 0;
|
||||
for ( int i = 0; i <= 3; i++ ) {
|
||||
for ( int j = i; j <= 3; j++ ) {
|
||||
Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetAllelePair(){
|
||||
allelePairTest(0, 0, 0);
|
||||
allelePairTest(1, 0, 1);
|
||||
allelePairTest(2, 1, 1);
|
||||
allelePairTest(3, 0, 2);
|
||||
allelePairTest(4, 1, 2);
|
||||
allelePairTest(5, 2, 2);
|
||||
allelePairTest(6, 0, 3);
|
||||
allelePairTest(7, 1, 3);
|
||||
allelePairTest(8, 2, 3);
|
||||
allelePairTest(9, 3, 3);
|
||||
}
|
||||
|
||||
private void allelePairTest(int PLindex, int allele1, int allele2) {
|
||||
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly");
|
||||
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
public class GenotypeUnitTest extends VariantBaseTest {
|
||||
Allele A, Aref, T;
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
A = Allele.create("A");
|
||||
Aref = Allele.create("A", true);
|
||||
T = Allele.create("T");
|
||||
}
|
||||
|
||||
private static final GenotypeBuilder makeGB() {
|
||||
return new GenotypeBuilder("misc");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilters() {
|
||||
Assert.assertFalse(makeGB().make().isFiltered(), "by default Genotypes must be PASS");
|
||||
Assert.assertNull(makeGB().make().getFilters(), "by default Genotypes must be PASS => getFilters() == null");
|
||||
Assert.assertFalse(makeGB().filter(null).make().isFiltered(), "setting filter == null => Genotypes must be PASS");
|
||||
Assert.assertNull(makeGB().filter(null).make().getFilters(), "Genotypes PASS => getFilters == null");
|
||||
Assert.assertFalse(makeGB().filter("PASS").make().isFiltered(), "setting filter == PASS => Genotypes must be PASS");
|
||||
Assert.assertNull(makeGB().filter("PASS").make().getFilters(), "Genotypes PASS => getFilters == null");
|
||||
Assert.assertTrue(makeGB().filter("x").make().isFiltered(), "setting filter != null => Genotypes must be PASS");
|
||||
Assert.assertEquals(makeGB().filter("x").make().getFilters(), "x", "Should get back the expected filter string");
|
||||
Assert.assertEquals(makeGB().filters("x", "y").make().getFilters(), "x;y", "Multiple filter field values should be joined with ;");
|
||||
Assert.assertEquals(makeGB().filters("x", "y", "z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;");
|
||||
Assert.assertTrue(makeGB().filters("x", "y", "z").make().isFiltered(), "Multiple filter values should be filtered");
|
||||
Assert.assertEquals(makeGB().filter("x;y;z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;");
|
||||
}
|
||||
|
||||
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
|
||||
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased, double[] log10Likelihoods) {
|
||||
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, double[] log10Likelihoods)
|
||||
// public Genotype(String sampleName, List<Allele> alleles, double negLog10PError)
|
||||
// public Genotype(String sampleName, List<Allele> alleles)
|
||||
// public List<Allele> getAlleles()
|
||||
// public List<Allele> getAlleles(Allele allele)
|
||||
// public Allele getAllele(int i)
|
||||
// public boolean isPhased()
|
||||
// public int getPloidy()
|
||||
// public Type getType()
|
||||
// public boolean isHom()
|
||||
// public boolean isHomRef()
|
||||
// public boolean isHomVar()
|
||||
// public boolean isHet()
|
||||
// public boolean isNoCall()
|
||||
// public boolean isCalled()
|
||||
// public boolean isAvailable()
|
||||
// public boolean hasLikelihoods()
|
||||
// public GenotypeLikelihoods getLikelihoods()
|
||||
// public boolean sameGenotype(Genotype other)
|
||||
// public boolean sameGenotype(Genotype other, boolean ignorePhase)
|
||||
// public String getSampleName()
|
||||
// public boolean hasLog10PError()
|
||||
// public double getLog10PError()
|
||||
// public double getPhredScaledQual()
|
||||
// public boolean hasExtendedAttribute(String key)
|
||||
// public Object getExtendedAttribute(String key)
|
||||
// public Object getExtendedAttribute(String key, Object defaultValue)
|
||||
// public String getAttributeAsString(String key, String defaultValue)
|
||||
// public int getAttributeAsInt(String key, int defaultValue)
|
||||
// public double getAttributeAsDouble(String key, double defaultValue)
|
||||
// public boolean getAttributeAsBoolean(String key, boolean defaultValue)
|
||||
}
|
||||
|
|
@ -1,309 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
 * Unit tests for GenotypesContext and its lazily-decoded variant,
 * LazyGenotypesContext. Every test runs against both implementations (and
 * against every prefix of the sample list, sorted and reversed) via the
 * "GenotypesContextProvider" TestNG data provider.
 */
public class GenotypesContextUnitTest extends VariantBaseTest {
    // Shared alleles / genotypes, initialized once in before()
    Allele Aref, C, T;
    Genotype AA, AT, TT, AC, CT, CC, MISSING;
    List<Genotype> allGenotypes;

    @BeforeSuite
    public void before() {
        C = Allele.create("C");
        Aref = Allele.create("A", true);
        T = Allele.create("T");
        AA = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        AT = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        TT = GenotypeBuilder.create("TT", Arrays.asList(T, T));
        AC = GenotypeBuilder.create("AC", Arrays.asList(Aref, C));
        CT = GenotypeBuilder.create("CT", Arrays.asList(C, T));
        CC = GenotypeBuilder.create("CC", Arrays.asList(C, C));
        // MISSING is deliberately excluded from allGenotypes below; the tests use
        // it to probe lookups for a sample that is never in the context.
        MISSING = GenotypeBuilder.create("MISSING", Arrays.asList(C, C));

        allGenotypes = Arrays.asList(AA, AT, TT, AC, CT, CC);
    }

    // --------------------------------------------------------------------------------
    //
    // Provider
    //
    // --------------------------------------------------------------------------------

    // Abstracts over HOW a GenotypesContext is constructed (eager vs. lazy)
    private interface ContextMaker {
        public GenotypesContext make(List<Genotype> initialSamples);
    }

    // Eager implementation: a plain copy of the genotype list
    private ContextMaker baseMaker = new ContextMaker() {
        @Override
        public GenotypesContext make(final List<Genotype> initialSamples) {
            return GenotypesContext.copy(initialSamples);
        }

        @Override
        public String toString() {
            return "GenotypesContext";
        }
    };

    // Lazy implementation: acts as its own LazyParser, producing the decoded
    // data only when the LazyGenotypesContext is first accessed
    private final class lazyMaker implements LazyGenotypesContext.LazyParser, ContextMaker {
        @Override
        public LazyGenotypesContext.LazyData parse(final Object data) {
            GenotypesContext gc = GenotypesContext.copy((List<Genotype>)data);
            // Force the name map and sample ordering caches so the LazyData is
            // handed fully-built internal state.
            gc.ensureSampleNameMap();
            gc.ensureSampleOrdering();
            return new LazyGenotypesContext.LazyData(gc.notToBeDirectlyAccessedGenotypes, gc.sampleNamesInOrder, gc.sampleNameToOffset);
        }

        @Override
        public GenotypesContext make(final List<Genotype> initialSamples) {
            return new LazyGenotypesContext(this, initialSamples, initialSamples.size());
        }

        @Override
        public String toString() {
            return "LazyGenotypesContext";
        }
    }

    private Collection<ContextMaker> allMakers = Arrays.asList(baseMaker, new lazyMaker());

    // One test configuration: a maker plus the genotypes it starts with
    private class GenotypesContextProvider extends TestDataProvider {
        ContextMaker maker;
        final List<Genotype> initialSamples;

        private GenotypesContextProvider(ContextMaker maker, List<Genotype> initialSamples) {
            // TestDataProvider registers each instance as a side effect of construction
            super(GenotypesContextProvider.class, String.format("%s with %d samples", maker.toString(), initialSamples.size()));
            this.maker = maker;
            this.initialSamples = initialSamples;
        }

        public GenotypesContext makeContext() {
            return maker.make(initialSamples);
        }
    }

    @DataProvider(name = "GenotypesContextProvider")
    public Object[][] MakeSampleNamesTest() {
        // Cross product: each maker x each prefix of allGenotypes (sizes 0..5),
        // in both sorted and reversed sample order. The constructors register
        // themselves; getTests() collects the registered instances.
        for ( ContextMaker maker : allMakers ) {
            for ( int i = 0; i < allGenotypes.size(); i++ ) {
                List<Genotype> samples = allGenotypes.subList(0, i);
                // sorted
                new GenotypesContextProvider(maker, samples);
                // unsorted
                new GenotypesContextProvider(maker, GeneralUtils.reverse(samples));
            }
        }

        return GenotypesContextProvider.getTests(GenotypesContextProvider.class);
    }

    // Asserts the iterable yields exactly the genotypes named in expectedNames
    // (membership and count; order is not checked here)
    private final static void testIterable(Iterable<Genotype> genotypeIterable, Set<String> expectedNames) {
        int count = 0;
        for ( final Genotype g : genotypeIterable ) {
            Assert.assertTrue(expectedNames.contains(g.getSampleName()));
            count++;
        }
        Assert.assertEquals(count, expectedNames.size(), "Iterable returned unexpected number of genotypes");
    }

    @Test(dataProvider = "GenotypesContextProvider")
    public void testInitialSamplesAreAsExpected(GenotypesContextProvider cfg) {
        testGenotypesContextContainsExpectedSamples(cfg.makeContext(), cfg.initialSamples);
    }

    // Core invariant check used by every mutation test: size, positional access,
    // name lookup, all three iteration modes, and the sample-set utilities
    private final void testGenotypesContextContainsExpectedSamples(GenotypesContext gc, List<Genotype> expectedSamples) {
        Assert.assertEquals(gc.isEmpty(), expectedSamples.isEmpty());
        Assert.assertEquals(gc.size(), expectedSamples.size());

        // get(index) is doing the right thing
        for ( int i = 0; i < expectedSamples.size(); i++ ) {
            Assert.assertEquals(gc.get(i), expectedSamples.get(i));
        }
        Assert.assertFalse(gc.containsSample(MISSING.getSampleName()));

        // we can fetch samples by name
        final Set<String> genotypeNames = VariantContextUtils.genotypeNames(expectedSamples);
        for ( final String name : genotypeNames ) {
            Assert.assertTrue(gc.containsSample(name));
        }
        Assert.assertFalse(gc.containsSample(MISSING.getSampleName()));

        // all of the iterators are working
        testIterable(gc.iterateInSampleNameOrder(), genotypeNames);
        testIterable(gc, genotypeNames);
        testIterable(gc.iterateInSampleNameOrder(genotypeNames), genotypeNames);
        if ( ! genotypeNames.isEmpty() ) {
            // iterating over a single-name subset must yield exactly that sample
            Set<String> first = Collections.singleton(genotypeNames.iterator().next());
            testIterable(gc.iterateInSampleNameOrder(first), first);
        }

        // misc. utils are working as expected
        Assert.assertEquals(gc.getSampleNames(), genotypeNames);
        Assert.assertTrue(ParsingUtils.isSorted(gc.getSampleNamesOrderedByName()));
        Assert.assertTrue(ParsingUtils.isSorted(gc.iterateInSampleNameOrder()));
        Assert.assertTrue(gc.containsSamples(genotypeNames));

        // containsSamples must be false once any absent sample is in the query set
        final Set<String> withMissing = new HashSet<String>(Arrays.asList(MISSING.getSampleName()));
        withMissing.addAll(genotypeNames);
        Assert.assertFalse(gc.containsSamples(withMissing));
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testImmutable(GenotypesContextProvider cfg) {
        GenotypesContext gc = cfg.makeContext();
        Assert.assertEquals(gc.isMutable(), true);
        gc.immutable();
        Assert.assertEquals(gc.isMutable(), false);
    }

    // After immutable(), any mutation attempt must throw
    @Test(enabled = true, dataProvider = "GenotypesContextProvider", expectedExceptions = Throwable.class )
    public void testImmutableCall1(GenotypesContextProvider cfg) {
        GenotypesContext gc = cfg.makeContext();
        gc.immutable();
        gc.add(MISSING);
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testClear(GenotypesContextProvider cfg) {
        GenotypesContext gc = cfg.makeContext();
        gc.clear();
        testGenotypesContextContainsExpectedSamples(gc, Collections.<Genotype>emptyList());
    }

    // Copy of genotypes with the given entries appended
    private static final List<Genotype> with(List<Genotype> genotypes, Genotype ... add) {
        List<Genotype> l = new ArrayList<Genotype>(genotypes);
        l.addAll(Arrays.asList(add));
        return l;
    }

    // Copy of genotypes with the given entries removed
    private static final List<Genotype> without(List<Genotype> genotypes, Genotype ... remove) {
        List<Genotype> l = new ArrayList<Genotype>(genotypes);
        l.removeAll(Arrays.asList(remove));
        return l;
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testAdds(GenotypesContextProvider cfg) {
        Genotype add1 = GenotypeBuilder.create("add1", Arrays.asList(Aref, Aref));
        Genotype add2 = GenotypeBuilder.create("add2", Arrays.asList(Aref, Aref));

        // single add
        GenotypesContext gc = cfg.makeContext();
        gc.add(add1);
        testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1));

        // two sequential adds
        gc = cfg.makeContext();
        gc.add(add1);
        gc.add(add2);
        testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));

        // bulk addAll
        gc = cfg.makeContext();
        gc.addAll(Arrays.asList(add1, add2));
        testGenotypesContextContainsExpectedSamples(gc, with(cfg.initialSamples, add1, add2));
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testRemoves(GenotypesContextProvider cfg) {
        Genotype rm1 = AA;
        Genotype rm2 = AC;

        // remove an element known to be present (only meaningful with >1 entries)
        GenotypesContext gc = cfg.makeContext();
        if (gc.size() > 1) {
            Genotype rm = gc.get(0);
            gc.remove(rm);
            testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm));
        }

        // remove specific genotypes (which may or may not be in this prefix)
        // and compare against the analogous without() list
        gc = cfg.makeContext();
        gc.remove(rm1);
        testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1));

        gc = cfg.makeContext();
        gc.remove(rm1);
        gc.remove(rm2);
        testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2));

        gc = cfg.makeContext();
        gc.removeAll(Arrays.asList(rm1, rm2));
        testGenotypesContextContainsExpectedSamples(gc, without(cfg.initialSamples, rm1, rm2));

        // retainAll: expected survivors are exactly those of rm1/rm2 present beforehand
        gc = cfg.makeContext();
        HashSet<Genotype> expected = new HashSet<Genotype>();
        if ( gc.contains(rm1) ) expected.add(rm1);
        if ( gc.contains(rm2) ) expected.add(rm2);
        gc.retainAll(Arrays.asList(rm1, rm2));

        // ensure that the two lists are the same
        Assert.assertEquals(new HashSet<Genotype>(gc.getGenotypes()), expected);
        // because the list order can change, we use the gc's list itself
        testGenotypesContextContainsExpectedSamples(gc, gc.getGenotypes());
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testSet(GenotypesContextProvider cfg) {
        Genotype set = GenotypeBuilder.create("replace", Arrays.asList(Aref, Aref));
        int n = cfg.makeContext().size();
        // replace each position in turn (by index) and verify the swap took effect
        for ( int i = 0; i < n; i++ ) {
            GenotypesContext gc = cfg.makeContext();
            Genotype setted = gc.set(i, set);
            Assert.assertNotNull(setted);
            ArrayList<Genotype> l = new ArrayList<Genotype>(cfg.initialSamples);
            l.set(i, set);
            testGenotypesContextContainsExpectedSamples(gc, l);
        }
    }

    @Test(enabled = true, dataProvider = "GenotypesContextProvider")
    public void testReplace(GenotypesContextProvider cfg) {
        int n = cfg.makeContext().size();
        // replace() keys off the sample name rather than the index
        for ( int i = 0; i < n; i++ ) {
            GenotypesContext gc = cfg.makeContext();
            Genotype toReplace = gc.get(i);
            Genotype replacement = GenotypeBuilder.create(toReplace.getSampleName(), Arrays.asList(Aref, Aref));
            gc.replace(replacement);
            ArrayList<Genotype> l = new ArrayList<Genotype>(cfg.initialSamples);
            l.set(i, replacement);
            Assert.assertEquals(replacement, gc.get(i));
            testGenotypesContextContainsExpectedSamples(gc, l);
        }
    }

    // subset to samples tested in VariantContextUnitTest
}
|
||||
|
|
@ -1,974 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
||||
import org.broadinstitute.variant.utils.GeneralUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.variant.variantcontext.writer.Options;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Routines for generating all sorts of VCs for testing
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class VariantContextTestProvider {
|
||||
final private static boolean ENABLE_GENOTYPE_TESTS = true;
|
||||
final private static boolean ENABLE_A_AND_G_TESTS = true;
|
||||
final private static boolean ENABLE_VARARRAY_TESTS = true;
|
||||
final private static boolean ENABLE_PLOIDY_TESTS = true;
|
||||
final private static boolean ENABLE_PL_TESTS = true;
|
||||
final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true;
|
||||
final private static boolean ENABLE_SOURCE_VCF_TESTS = true;
|
||||
final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true;
|
||||
final private static List<Integer> TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
|
||||
|
||||
private static VCFHeader syntheticHeader;
|
||||
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
|
||||
private static VariantContext ROOT;
|
||||
|
||||
private final static List<File> testSourceVCFs = new ArrayList<File>();
|
||||
static {
|
||||
testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"));
|
||||
testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "ex2.vcf"));
|
||||
testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "dbsnp_135.b37.1000.vcf"));
|
||||
if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) {
|
||||
testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "diagnosis_targets_testfile.vcf"));
|
||||
testSourceVCFs.add(new File(VariantBaseTest.variantTestDataRoot + "VQSR.mixedTest.recal"));
|
||||
}
|
||||
}
|
||||
|
||||
public static class VariantContextContainer {
|
||||
private VCFHeader header;
|
||||
private Iterable<VariantContext> vcs;
|
||||
|
||||
public VariantContextContainer( VCFHeader header, Iterable<VariantContext> vcs ) {
|
||||
this.header = header;
|
||||
this.vcs = vcs;
|
||||
}
|
||||
|
||||
public VCFHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public Iterable<VariantContext> getVCs() {
|
||||
return vcs;
|
||||
}
|
||||
}
|
||||
|
||||
public abstract static class VariantContextIOTest {
|
||||
public String toString() {
|
||||
return "VariantContextIOTest:" + getExtension();
|
||||
}
|
||||
public abstract String getExtension();
|
||||
public abstract FeatureCodec<VariantContext> makeCodec();
|
||||
public abstract VariantContextWriter makeWriter(final File outputFile, final EnumSet<Options> baseOptions);
|
||||
|
||||
public List<VariantContext> preprocess(final VCFHeader header, List<VariantContext> vcsBeforeIO) {
|
||||
return vcsBeforeIO;
|
||||
}
|
||||
|
||||
public List<VariantContext> postprocess(final VCFHeader header, List<VariantContext> vcsAfterIO) {
|
||||
return vcsAfterIO;
|
||||
}
|
||||
}
|
||||
|
||||
public static class VariantContextTestData {
|
||||
public final VCFHeader header;
|
||||
public List<VariantContext> vcs;
|
||||
|
||||
public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) {
|
||||
this(header, Collections.singletonList(builder.fullyDecoded(true).make()));
|
||||
}
|
||||
|
||||
public VariantContextTestData(final VCFHeader header, final List<VariantContext> vcs) {
|
||||
final Set<String> samples = new HashSet<String>();
|
||||
for ( final VariantContext vc : vcs )
|
||||
if ( vc.hasGenotypes() )
|
||||
samples.addAll(vc.getSampleNames());
|
||||
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples);
|
||||
this.vcs = vcs;
|
||||
}
|
||||
|
||||
public boolean hasGenotypes() {
|
||||
return vcs.get(0).hasGenotypes();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append("VariantContextTestData: [");
|
||||
final VariantContext vc = vcs.get(0);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.noGenotypes();
|
||||
b.append(builder.make().toString());
|
||||
if ( vc.getNSamples() < 5 ) {
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
b.append(g.toString());
|
||||
} else {
|
||||
b.append(" nGenotypes = ").append(vc.getNSamples());
|
||||
}
|
||||
|
||||
if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records");
|
||||
b.append("]");
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private final static VariantContextBuilder builder() {
|
||||
return new VariantContextBuilder(ROOT);
|
||||
}
|
||||
|
||||
private final static void add(VariantContextBuilder builder) {
|
||||
TEST_DATAs.add(new VariantContextTestData(syntheticHeader, builder));
|
||||
}
|
||||
|
||||
/**
 * Entry point: builds the synthetic header, then populates TEST_DATAs with
 * both hand-crafted and file-derived test cases. Order matters: the header
 * must exist before any synthetic test case is created.
 *
 * @throws IOException if one of the source VCF files cannot be read
 */
public static void initializeTests() throws IOException {
    createSyntheticHeader();
    makeSyntheticTests();
    makeEmpiricalTests();
}
|
||||
|
||||
private static void makeEmpiricalTests() throws IOException {
|
||||
if ( ENABLE_SOURCE_VCF_TESTS ) {
|
||||
for ( final File file : testSourceVCFs ) {
|
||||
VCFCodec codec = new VCFCodec();
|
||||
VariantContextContainer x = readAllVCs( file, codec );
|
||||
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>();
|
||||
|
||||
for ( final VariantContext raw : x.getVCs() ) {
|
||||
if ( raw != null )
|
||||
fullyDecoded.add(raw.fullyDecode(x.getHeader(), false));
|
||||
}
|
||||
|
||||
TEST_DATAs.add(new VariantContextTestData(x.getHeader(), fullyDecoded));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final int count, final VCFHeaderLineType type) {
|
||||
metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
|
||||
if ( type != VCFHeaderLineType.Flag )
|
||||
metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
|
||||
}
|
||||
|
||||
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) {
|
||||
metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
|
||||
if ( type != VCFHeaderLineType.Flag )
|
||||
metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
|
||||
}
|
||||
|
||||
/**
 * Builds the synthetic VCFHeader used by the hand-crafted tests: INFO/FORMAT
 * lines covering each VCF type with fixed, per-allele (A), per-genotype (G),
 * and unbounded counts, plus a contig line and two FILTER lines.
 */
private static void createSyntheticHeader() {
    Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();

    // String-typed fields with various counts
    addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String);
    addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String);
    addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String);
    addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);

    // Genotype-level fields: GT/GQ plus A- and G-counted numeric fields
    addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String);
    addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String);
    addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
    addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);

    // prep the header
    metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));

    // filters referenced by the filter tests in makeSyntheticTests()
    metaData.add(new VCFFilterHeaderLine("FILTER1"));
    metaData.add(new VCFFilterHeaderLine("FILTER2"));

    // numeric INFO/FORMAT fields with fixed and unbounded counts, plus a Flag
    addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer);
    addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float);
    addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float);
    addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag);

    syntheticHeader = new VCFHeader(metaData);
}
|
||||
|
||||
|
||||
/**
 * Builds the hand-crafted test cases: a minimal ROOT site plus variations on
 * alleles, filters, quality, IDs, and INFO attributes for every type declared
 * in the synthetic header; then (per the ENABLE_* switches) layers on
 * genotype, A/G-count, and symbolic-allele tests.
 */
private static void makeSyntheticTests() {
    // ROOT: an unfiltered A/C SNP at 1:10 that every builder() call clones
    VariantContextBuilder rootBuilder = new VariantContextBuilder();
    rootBuilder.source("test");
    rootBuilder.loc("1", 10, 10);
    rootBuilder.alleles("A", "C");
    rootBuilder.unfiltered();
    ROOT = rootBuilder.make();

    // allele configurations: ROOT itself, monomorphic, multi-allelic,
    // insertions, and deletions (with widened loci for the deletions)
    add(builder());
    add(builder().alleles("A"));
    add(builder().alleles("A", "C", "T"));
    add(builder().alleles("A", "AC"));
    add(builder().alleles("A", "ACAGT"));
    add(builder().loc("1", 10, 11).alleles("AC", "A"));
    add(builder().loc("1", 10, 13).alleles("ACGT", "A"));

    // make sure filters work
    add(builder().unfiltered());
    add(builder().passFilters());
    add(builder().filters("FILTER1"));
    add(builder().filters("FILTER1", "FILTER2"));

    // site quality: missing, small, and very large values
    add(builder().log10PError(VariantContext.NO_LOG10_PERROR));
    add(builder().log10PError(-1));
    add(builder().log10PError(-1.234e6));

    // with and without an rsID
    add(builder().noID());
    add(builder().id("rsID12345"));

    // integer INFO attributes at several magnitudes; null means a missing value
    add(builder().attribute("INT1", 1));
    add(builder().attribute("INT1", 100));
    add(builder().attribute("INT1", 1000));
    add(builder().attribute("INT1", 100000));
    add(builder().attribute("INT1", null));
    add(builder().attribute("INT3", Arrays.asList(1, 2, 3)));
    add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000)));
    add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000)));
    add(builder().attribute("INT3", null));
    add(builder().attribute("INT20", TWENTY_INTS));

    // float INFO attributes, same pattern
    add(builder().attribute("FLOAT1", 1.0));
    add(builder().attribute("FLOAT1", 100.0));
    add(builder().attribute("FLOAT1", 1000.0));
    add(builder().attribute("FLOAT1", 100000.0));
    add(builder().attribute("FLOAT1", null));
    add(builder().attribute("FLOAT3", Arrays.asList(1.0, 2.0, 3.0)));
    add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0)));
    add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0)));
    add(builder().attribute("FLOAT3", null));

    add(builder().attribute("FLAG", true));
    //add(builder().attribute("FLAG", false)); // NOTE -- VCF doesn't allow false flags

    // string INFO attributes with fixed counts
    add(builder().attribute("STRING1", "s1"));
    add(builder().attribute("STRING1", null));
    add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
    add(builder().attribute("STRING3", null));
    add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));

    // unbounded-count string attribute: scalar, lists, and missing
    add(builder().attribute("VAR.INFO.STRING", "s1"));
    add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2")));
    add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3")));
    add(builder().attribute("VAR.INFO.STRING", null));

    if ( ENABLE_GENOTYPE_TESTS ) {
        addGenotypesToTestData();
        addComplexGenotypesTest();
    }

    if ( ENABLE_A_AND_G_TESTS )
        addGenotypesAndGTests();

    if ( ENABLE_SYMBOLIC_ALLELE_TESTS )
        addSymbolicAlleleTests();
}
|
||||
|
||||
private static void addSymbolicAlleleTests() {
|
||||
// two tests to ensure that the end is computed correctly when there's (and not) an END field present
|
||||
add(builder().alleles("N", "<VQSR>").start(10).stop(11).attribute("END", 11));
|
||||
add(builder().alleles("N", "<VQSR>").start(10).stop(10));
|
||||
}
|
||||
|
||||
private static void addGenotypesToTestData() {
|
||||
final ArrayList<VariantContext> sites = new ArrayList<VariantContext>();
|
||||
|
||||
sites.add(builder().alleles("A").make());
|
||||
sites.add(builder().alleles("A", "C", "T").make());
|
||||
sites.add(builder().alleles("A", "AC").make());
|
||||
sites.add(builder().alleles("A", "ACAGT").make());
|
||||
|
||||
for ( VariantContext site : sites ) {
|
||||
addGenotypes(site);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Registers variation tests built from {@code site} plus the given genotypes.
 *
 * The first genotype is the primary and appears in every generated test; if
 * more genotypes are supplied, additional tests use all of them, and then the
 * last genotype is replicated 10, 100, and 1000 times (under fresh "copyN"
 * sample names) to exercise scaling.
 */
private static void addGenotypeTests( final VariantContext site, Genotype ... genotypes ) {
    // for each sites VC, we are going to add create two root genotypes.
    // The first is the primary, and will be added to each new test
    // The second is variable. In some tests it's absent (testing 1 genotype), in others it is duplicated
    // 1 once, 10, 100, or 1000 times to test scaling

    final VariantContextBuilder builder = new VariantContextBuilder(site);

    // add a single context
    builder.genotypes(genotypes[0]);
    add(builder);

    if ( genotypes.length > 1 ) {
        // add all
        add(builder.genotypes(Arrays.asList(genotypes)));

        // add all with the last replicated 10x, 100x, and 1000x times
        for ( int nCopiesOfLast : Arrays.asList(10, 100, 1000) ) {
            final GenotypesContext gc = new GenotypesContext();
            final Genotype last = genotypes[genotypes.length-1];
            // all genotypes except the last go in once each
            for ( int i = 0; i < genotypes.length - 1; i++ )
                gc.add(genotypes[i]);
            // the last is cloned under unique "copyN" sample names
            for ( int i = 0; i < nCopiesOfLast; i++ )
                gc.add(new GenotypeBuilder(last).name("copy" + i).make());
            add(builder.genotypes(gc));
        }
    }
}
|
||||
|
||||
/**
 * Generates genotype-level test cases for {@code site}: hom-ref/het/hom-var
 * combinations, missing GTs, mixed ploidy, phasing, PLs, per-genotype
 * attributes of various types and arities, and genotype filters. Several
 * groups are gated behind ENABLE_* flags declared elsewhere in this class.
 */
private static void addGenotypes( final VariantContext site) {
    // test ref/ref
    final Allele ref = site.getReference();
    // first ALT allele, or null when the site is monomorphic
    final Allele alt1 = site.getNAlleles() > 1 ? site.getAlternateAllele(0) : null;
    final Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(ref, ref));
    addGenotypeTests(site, homRef);

    if ( alt1 != null ) {
        final Genotype het = GenotypeBuilder.create("het", Arrays.asList(ref, alt1));
        final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1));
        addGenotypeTests(site, homRef, het);
        addGenotypeTests(site, homRef, het, homVar);

        // test no GT at all
        addGenotypeTests(site, new GenotypeBuilder("noGT", new ArrayList<Allele>(0)).attribute("INT1", 10).make());

        final List<Allele> noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);

        // ploidy
        if ( ENABLE_PLOIDY_TESTS ) {
            addGenotypeTests(site,
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("hap", Arrays.asList(ref)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("noCall", noCall),
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("hap", Arrays.asList(ref)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("noCall", noCall),
                    GenotypeBuilder.create("noCall2", noCall),
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("hap", Arrays.asList(ref)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("noCall", noCall),
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("noCall", noCall),
                    GenotypeBuilder.create("noCall2", noCall),
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));

            addGenotypeTests(site,
                    GenotypeBuilder.create("nocall", noCall),
                    GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
                    GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));
        }

        //
        //
        // TESTING PHASE
        //
        //
        final Genotype gUnphased = new GenotypeBuilder("gUnphased", Arrays.asList(ref, alt1)).make();
        final Genotype gPhased = new GenotypeBuilder("gPhased", Arrays.asList(ref, alt1)).phased(true).make();
        final Genotype gPhased2 = new GenotypeBuilder("gPhased2", Arrays.asList(alt1, alt1)).phased(true).make();
        final Genotype gPhased3 = new GenotypeBuilder("gPhased3", Arrays.asList(ref, ref)).phased(true).make();
        final Genotype haploidNoPhase = new GenotypeBuilder("haploidNoPhase", Arrays.asList(ref)).make();
        addGenotypeTests(site, gUnphased, gPhased);
        addGenotypeTests(site, gUnphased, gPhased2);
        addGenotypeTests(site, gUnphased, gPhased3);
        addGenotypeTests(site, gPhased, gPhased2);
        addGenotypeTests(site, gPhased, gPhased3);
        addGenotypeTests(site, gPhased2, gPhased3);
        addGenotypeTests(site, haploidNoPhase, gPhased);
        addGenotypeTests(site, haploidNoPhase, gPhased2);
        addGenotypeTests(site, haploidNoPhase, gPhased3);
        addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2);
        addGenotypeTests(site, haploidNoPhase, gPhased, gPhased3);
        addGenotypeTests(site, haploidNoPhase, gPhased2, gPhased3);
        addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2, gPhased3);

        // mixed phasing at higher ploidy
        final Genotype gUnphasedTet = new GenotypeBuilder("gUnphasedTet", Arrays.asList(ref, alt1, ref, alt1)).make();
        final Genotype gPhasedTet = new GenotypeBuilder("gPhasedTet", Arrays.asList(ref, alt1, alt1, alt1)).phased(true).make();
        addGenotypeTests(site, gUnphasedTet, gPhasedTet);
    }

    if ( ENABLE_PL_TESTS ) {
        if ( site.getNAlleles() == 2 ) {
            // testing PLs
            addGenotypeTests(site,
                    GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2}),
                    GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3}));

            addGenotypeTests(site,
                    GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
                    GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3}));

            addGenotypeTests(site,
                    GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
                    GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2000, -1000}));

            addGenotypeTests(site, // missing PLs
                    GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
                    GenotypeBuilder.create("g2", Arrays.asList(ref, ref)));
        }
        else if ( site.getNAlleles() == 3 ) {
            // testing PLs
            addGenotypeTests(site,
                    GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2, -3, -4, -5}),
                    GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3, -4, -5, -6}));
        }
    }

    // test attributes
    addGenotypeTests(site,
            attr("g1", ref, "INT1", 1),
            attr("g2", ref, "INT1", 2));
    addGenotypeTests(site,
            attr("g1", ref, "INT1", 1),
            attr("g2", ref, "INT1"));
    addGenotypeTests(site,
            attr("g1", ref, "INT3", 1, 2, 3),
            attr("g2", ref, "INT3", 4, 5, 6));
    addGenotypeTests(site,
            attr("g1", ref, "INT3", 1, 2, 3),
            attr("g2", ref, "INT3"));

    addGenotypeTests(site,
            attr("g1", ref, "INT20", TWENTY_INTS),
            attr("g2", ref, "INT20", TWENTY_INTS));

    if (ENABLE_VARARRAY_TESTS) {
        // variable-count ("Number=.") integer attributes
        addGenotypeTests(site,
                attr("g1", ref, "INT.VAR", 1, 2, 3),
                attr("g2", ref, "INT.VAR", 4, 5),
                attr("g3", ref, "INT.VAR", 6));
        addGenotypeTests(site,
                attr("g1", ref, "INT.VAR", 1, 2, 3),
                attr("g2", ref, "INT.VAR"),
                attr("g3", ref, "INT.VAR", 5));
    }

    addGenotypeTests(site,
            attr("g1", ref, "FLOAT1", 1.0),
            attr("g2", ref, "FLOAT1", 2.0));
    addGenotypeTests(site,
            attr("g1", ref, "FLOAT1", 1.0),
            attr("g2", ref, "FLOAT1"));
    addGenotypeTests(site,
            attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
            attr("g2", ref, "FLOAT3", 4.0, 5.0, 6.0));
    addGenotypeTests(site,
            attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
            attr("g2", ref, "FLOAT3"));

    if (ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS) {
        //
        //
        // TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD
        //
        //
        addGenotypeTests(site,
                attr("g1", ref, "GS", Arrays.asList("S1", "S2")),
                attr("g2", ref, "GS", Arrays.asList("S3", "S4")));

        addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1
                attr("g1", ref, "FLOAT1", 1.0),
                attr("g2", ref, "GS", Arrays.asList("S3", "S4")));

        // variable sized lists
        addGenotypeTests(site,
                attr("g1", ref, "GV", "S1"),
                attr("g2", ref, "GV", Arrays.asList("S3", "S4")));

        addGenotypeTests(site,
                attr("g1", ref, "GV", Arrays.asList("S1", "S2")),
                attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));

        addGenotypeTests(site, // missing value in varlist of string
                attr("g1", ref, "FLOAT1", 1.0),
                attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
    }

    //
    //
    // TESTING GENOTYPE FILTERS
    //
    //
    addGenotypeTests(site,
            new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(),
            new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
    addGenotypeTests(site,
            new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
            new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
    addGenotypeTests(site,
            new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
            new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
    addGenotypeTests(site,
            new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
            new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(),
            new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
}
|
||||
|
||||
/**
 * Generates tests exercising a Number=A attribute ("ADA") and Number=G
 * likelihoods ("PL") across ploidies 1-5 and sites with 1-4 alleles.
 */
private static void addGenotypesAndGTests() {
//        for ( final int ploidy : Arrays.asList(2)) {
    for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) {
        final List<List<String>> alleleCombinations =
                Arrays.asList(
                        Arrays.asList("A"),
                        Arrays.asList("A", "C"),
                        Arrays.asList("A", "C", "G"),
                        Arrays.asList("A", "C", "G", "T"));

        for ( final List<String> alleles : alleleCombinations ) {
            final VariantContextBuilder vcb = builder().alleles(alleles);
            final VariantContext site = vcb.make();
            final int nAlleles = site.getNAlleles();
            final Allele ref = site.getReference();

            // base genotype is ref/.../ref up to ploidy
            final List<Allele> baseGenotype = new ArrayList<Allele>(ploidy);
            for ( int i = 0; i < ploidy; i++) baseGenotype.add(ref);
            final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy);

            // ada holds one value per ALT allele: 0, 1, ..., nAlleles - 2
            final List<Integer> ada = new ArrayList<Integer>(nAlleles);
            for ( int i = 0; i < nAlleles - 1; i++ ) ada.add(i);

            // pl is 0, 1, ..., up to nPLs (complex calc of nAlleles and ploidy)
            final int[] pl = new int[nPLs];
            for ( int i = 0; i < pl.length; i++ ) pl[i] = i;

            final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE");
            gb.alleles(baseGenotype);
            gb.PL(pl);
            // biallelic sites store the scalar value; multi-allelic sites the list
            gb.attribute("ADA", nAlleles == 2 ? ada.get(0) : ada);
            vcb.genotypes(gb.make());

            add(vcb);
        }
    }
}
|
||||
|
||||
private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) {
|
||||
if ( value.length == 0 )
|
||||
return GenotypeBuilder.create(name, Arrays.asList(ref, ref));
|
||||
else {
|
||||
final Object toAdd = value.length == 1 ? value[0] : Arrays.asList(value);
|
||||
return new GenotypeBuilder(name, Arrays.asList(ref, ref)).attribute(key, toAdd).make();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the accumulated list of site-level test cases.
 * @return the shared TEST_DATAs list itself (not a copy)
 */
public static List<VariantContextTestData> generateSiteTests() {
    return TEST_DATAs;
}
|
||||
|
||||
/**
 * Round-trips {@code data} through the tester's writer/codec after adding two
 * extra samples ("MISSING1", "MISSING2") to the header with no genotype data.
 * On read-back, original samples must equal their expected genotypes and the
 * added samples must be no-calls. Skipped when there are fewer than 3 samples
 * or any record is symbolic.
 * @throws IOException on read/write failure
 */
public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
    final int nSamples = data.header.getNGenotypeSamples();
    if ( nSamples > 2 ) {
        for ( final VariantContext vc : data.vcs )
            if ( vc.isSymbolic() )
                // cannot handle symbolic alleles because they may be weird non-call VCFs
                return;

        final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
        tmpFile.deleteOnExit();

        // write expected to disk
        final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
        final VariantContextWriter writer = tester.makeWriter(tmpFile, options);

        final Set<String> samplesInVCF = new HashSet<String>(data.header.getGenotypeSamples());
        final List<String> missingSamples = Arrays.asList("MISSING1", "MISSING2");
        final List<String> allSamples = new ArrayList<String>(missingSamples);
        allSamples.addAll(samplesInVCF);

        // header now declares the extra samples, but the records do not carry them
        final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples);
        writeVCsToFile(writer, header, data.vcs);

        // ensure writing of expected == actual
        final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec());
        final Iterable<VariantContext> actual = p.getVCs();

        int i = 0;
        for ( final VariantContext readVC : actual ) {
            if ( readVC == null ) continue; // sometimes we read null records...
            final VariantContext expected = data.vcs.get(i++);
            for ( final Genotype g : readVC.getGenotypes() ) {
                Assert.assertTrue(allSamples.contains(g.getSampleName()));
                if ( samplesInVCF.contains(g.getSampleName()) ) {
                    assertEquals(g, expected.getGenotype(g.getSampleName()));
                } else {
                    // missing
                    Assert.assertTrue(g.isNoCall());
                }
            }
        }

    }
}
|
||||
|
||||
/**
 * Round-trips {@code data} through {@code tester}, comparing the records read
 * back against the originals, including one recursive re-round-trip of the
 * written output.
 * @throws IOException on read/write failure
 */
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
    testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
}
|
||||
|
||||
/**
 * Writes {@code vcs} with the tester's writer, reads them back with the
 * tester's codec, and asserts the round-trip equals {@code expected}. When
 * {@code recurse} is true, the freshly read records are themselves written
 * and re-read once more (with recurse=false) to catch asymmetric encodings.
 *
 * @param tester   supplies the writer and codec under test
 * @param header   VCF header to write
 * @param expected records the read-back must equal
 * @param vcs      records to write (may be a lazy iterable)
 * @param recurse  whether to perform the second round-trip
 * @throws IOException on read/write failure
 */
public static void testReaderWriter(final VariantContextIOTest tester,
                                    final VCFHeader header,
                                    final List<VariantContext> expected,
                                    final Iterable<VariantContext> vcs,
                                    final boolean recurse) throws IOException {
    final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
    tmpFile.deleteOnExit();

    // write expected to disk
    final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
    final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
    writeVCsToFile(writer, header, vcs);

    // ensure writing of expected == actual
    final VariantContextContainer p = readAllVCs(tmpFile, tester.makeCodec());
    final Iterable<VariantContext> actual = p.getVCs();
    assertEquals(actual, expected);

    if ( recurse ) {
        // if we are doing a recursive test, grab a fresh iterator over the written values
        final Iterable<VariantContext> read = readAllVCs(tmpFile, tester.makeCodec()).getVCs();
        testReaderWriter(tester, p.getHeader(), expected, read, false);
    }
}
|
||||
|
||||
private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable<VariantContext> vcs) {
|
||||
// write
|
||||
writer.writeHeader(header);
|
||||
for ( VariantContext vc : vcs )
|
||||
if (vc != null)
|
||||
writer.add(vc);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility class to read all of the VC records from a file
|
||||
*
|
||||
* @param source
|
||||
* @param codec
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public final static VariantContextContainer readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
|
||||
// read in the features
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
pbs.close();
|
||||
|
||||
pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
pbs.skip(header.getHeaderEnd());
|
||||
|
||||
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
|
||||
return new VariantContextContainer(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
|
||||
}
|
||||
|
||||
public static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
|
||||
final PositionalBufferedStream pbs;
|
||||
final FeatureCodec<VariantContext> codec;
|
||||
final VCFHeader header;
|
||||
|
||||
private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec<VariantContext> codec, final VCFHeader header) {
|
||||
this.pbs = pbs;
|
||||
this.codec = codec;
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<VariantContext> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
return ! pbs.isDone();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContext next() {
|
||||
try {
|
||||
final VariantContext vc = codec.decode(pbs);
|
||||
return vc == null ? null : vc.fullyDecode(header, false);
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
//To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
||||
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
|
||||
final VariantContextContainer vcfData = readAllVCs(vcfFile, new VCFCodec());
|
||||
final VariantContextContainer bcfData = readAllVCs(bcfFile, new BCF2Codec());
|
||||
assertEquals(bcfData.getHeader(), vcfData.getHeader());
|
||||
assertEquals(bcfData.getVCs(), vcfData.getVCs());
|
||||
}
|
||||
|
||||
/**
 * Asserts that two record streams are pairwise equal, skipping null entries
 * on both sides (the readers can legitimately yield nulls). Fails when the
 * actual stream runs out early or has non-null records left over.
 */
public static void assertEquals(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
    final Iterator<VariantContext> actualIT = actual.iterator();
    final Iterator<VariantContext> expectedIT = expected.iterator();

    while ( expectedIT.hasNext() ) {
        final VariantContext expectedVC = expectedIT.next();
        if ( expectedVC == null )
            continue;

        // advance past any null records in actual to find the next real one
        VariantContext actualVC;
        do {
            Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
            actualVC = actualIT.next();
        } while ( actualIT.hasNext() && actualVC == null );

        if ( actualVC == null )
            Assert.fail("Too few records in actual");

        assertEquals(actualVC, expectedVC);
    }
    Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
|
||||
|
||||
/**
 * Assert that two variant contexts are actually equal: position, ID, alleles,
 * attributes, filters, quality, and (when present) every genotype by sample.
 * @param actual   observed context; must not be null
 * @param expected reference context to compare against
 */
public static void assertEquals( final VariantContext actual, final VariantContext expected ) {
    Assert.assertNotNull(actual, "VariantContext expected not null");
    Assert.assertEquals(actual.getChr(), expected.getChr(), "chr");
    Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
    Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
    Assert.assertEquals(actual.getID(), expected.getID(), "id");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);

    // attributes get the tolerant comparison (missing-vs-null, rounded doubles)
    assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
    Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered");
    VariantBaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
    VariantBaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());

    Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
    if ( expected.hasGenotypes() ) {
        VariantBaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
        Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
        final Set<String> samples = expected.getSampleNames();
        for ( final String sample : samples ) {
            assertEquals(actual.getGenotype(sample), expected.getGenotype(sample));
        }
    }
}
|
||||
|
||||
/**
 * Asserts two genotypes are fully equal: name, alleles, type, filters, the
 * inline attributes (DP/AD/GQ/PL and their has* flags), likelihoods, quality,
 * extended attributes, phasing, and ploidy.
 */
public static void assertEquals(final Genotype actual, final Genotype expected) {
    Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles");
    Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string");
    Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type");

    // filters are the same
    Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered");

    // inline attributes
    Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp");
    Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad");
    Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq");
    Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL");
    Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD");
    Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ");
    Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP");

    Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods");
    Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString");
    Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods");
    Assert.assertEquals(actual.getPL(), expected.getPL(), "Genotype getPL");

    Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual");
    assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
    Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased");
    Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy");
}
|
||||
|
||||
/**
 * Asserts two attribute maps are equivalent. Lists are compared element-wise,
 * doubles tolerantly (via assertAttributeEquals); a key present on only one
 * side is acceptable when its value counts as missing (null / "." / all-null
 * list).
 */
private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) {
    // tracks expected keys not yet matched by an actual entry
    final Set<String> expectedKeys = new HashSet<String>(expected.keySet());

    for ( final Map.Entry<String, Object> act : actual.entrySet() ) {
        final Object actualValue = act.getValue();
        if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) {
            final Object expectedValue = expected.get(act.getKey());
            if ( expectedValue instanceof List ) {
                final List<Object> expectedList = (List<Object>)expectedValue;
                Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't");
                final List<Object> actualList = (List<Object>)actualValue;
                Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size");
                for ( int i = 0; i < expectedList.size(); i++ )
                    assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i));
            } else
                assertAttributeEquals(act.getKey(), actualValue, expectedValue);
        } else {
            // it's ok to have a binding in x -> null that's absent in y
            Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other");
        }
        expectedKeys.remove(act.getKey());
    }

    // now expectedKeys contains only the keys found in expected but not in actual,
    // and they must all be null
    for ( final String missingExpected : expectedKeys ) {
        final Object value = expected.get(missingExpected);
        Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" );
    }
}
|
||||
|
||||
private static final boolean isMissing(final Object value) {
|
||||
if ( value == null ) return true;
|
||||
else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true;
|
||||
else if ( value instanceof List ) {
|
||||
// handles the case where all elements are null or the list is empty
|
||||
for ( final Object elt : (List)value)
|
||||
if ( elt != null )
|
||||
return false;
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
|
||||
if ( expected instanceof Double ) {
|
||||
// must be very tolerant because doubles are being rounded to 2 sig figs
|
||||
VariantBaseTest.assertEqualsDoubleSmart(actual, (Double)expected, 1e-2);
|
||||
} else
|
||||
Assert.assertEquals(actual, expected, "Attribute " + key);
|
||||
}
|
||||
|
||||
/**
 * Generates tests covering every possible genotype (including no-call
 * alleles) for sites with 2-3 alleles and ploidies 1-3: one test per single
 * genotype, one test with a sample for each genotype, and mixed-ploidy
 * variants built by truncating genotypes below the highest ploidy.
 */
public static void addComplexGenotypesTest() {
    final List<Allele> allAlleles = Arrays.asList(
            Allele.create("A", true),
            Allele.create("C", false),
            Allele.create("G", false));

    for ( int nAlleles : Arrays.asList(2, 3) ) {
        for ( int highestPloidy : Arrays.asList(1, 2, 3) ) {
            // site alleles
            final List<Allele> siteAlleles = allAlleles.subList(0, nAlleles);

            // possible alleles for genotypes
            final List<Allele> possibleGenotypeAlleles = new ArrayList<Allele>(siteAlleles);
            possibleGenotypeAlleles.add(Allele.NO_CALL);

            // there are n^ploidy possible genotypes
            final List<List<Allele>> possibleGenotypes = makeAllGenotypes(possibleGenotypeAlleles, highestPloidy);
            final int nPossibleGenotypes = possibleGenotypes.size();

            VariantContextBuilder vb = new VariantContextBuilder("unittest", "1", 1, 1, siteAlleles);

            // first test -- create n copies of each genotype
            for ( int i = 0; i < nPossibleGenotypes; i++ ) {
                final List<Genotype> samples = new ArrayList<Genotype>(3);
                samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i)));
                add(vb.genotypes(samples));
            }

            // second test -- create one sample with each genotype
            {
                final List<Genotype> samples = new ArrayList<Genotype>(nPossibleGenotypes);
                for ( int i = 0; i < nPossibleGenotypes; i++ ) {
                    samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i)));
                }
                add(vb.genotypes(samples));
            }

            // test mixed ploidy
            for ( int i = 0; i < nPossibleGenotypes; i++ ) {
                for ( int ploidy = 1; ploidy < highestPloidy; ploidy++ ) {
                    final List<Genotype> samples = new ArrayList<Genotype>(highestPloidy);
                    // truncate the full genotype down to the lower ploidy
                    final List<Allele> genotype = possibleGenotypes.get(i).subList(0, ploidy);
                    samples.add(GenotypeBuilder.create("sample" + i, genotype));
                    add(vb.genotypes(samples));
                }
            }
        }
    }
}
|
||||
|
||||
// Enumerates all genotypes of the given ploidy over the supplied alleles via
// GeneralUtils.makePermutations. NOTE(review): the trailing boolean presumably
// allows repeated alleles in a genotype — confirm against GeneralUtils.
private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) {
    return GeneralUtils.makePermutations(alleles, highestPloidy, true);
}
|
||||
|
||||
/**
 * Asserts two VCF headers carry the same metadata lines, comparing the
 * sorted line lists element-by-element (better failure messages than a
 * whole-collection comparison).
 */
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
    Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");

    // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
    //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
    final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
    final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
    for ( int i = 0; i < actualLines.size(); i++ ) {
        Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
    }
}
|
||||
|
||||
public static void main( String argv[] ) {
|
||||
final File variants1 = new File(argv[0]);
|
||||
final File variants2 = new File(argv[1]);
|
||||
try {
|
||||
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2);
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,918 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class VariantContextUnitTest extends VariantBaseTest {
|
||||
Allele A, Aref, C, T, Tref;
|
||||
Allele del, delRef, ATC, ATCref;
|
||||
|
||||
// A [ref] / T at 10
|
||||
String snpLoc = "chr1";
|
||||
int snpLocStart = 10;
|
||||
int snpLocStop = 10;
|
||||
|
||||
// - / ATC [ref] from 20-22
|
||||
String delLoc = "chr1";
|
||||
int delLocStart = 20;
|
||||
int delLocStop = 22;
|
||||
|
||||
// - [ref] / ATC from 20-20
|
||||
String insLoc = "chr1";
|
||||
int insLocStart = 20;
|
||||
int insLocStop = 20;
|
||||
|
||||
VariantContextBuilder basicBuilder, snpBuilder, insBuilder;
|
||||
|
||||
    // Builds the shared allele fixtures once for the whole suite; individual
    // tests treat these as read-only.
    @BeforeSuite
    public void before() {
        // "del" is the 1bp alt paired with ATCref to model a deletion;
        // "delRef" is the 1bp ref paired with ATC to model an insertion.
        del = Allele.create("A");
        delRef = Allele.create("A", true);

        // single-base SNP alleles, in both reference and non-reference form
        A = Allele.create("A");
        C = Allele.create("C");
        Aref = Allele.create("A", true);
        T = Allele.create("T");
        Tref = Allele.create("T", true);

        // multi-base alleles for indel/MNP tests
        ATC = Allele.create("ATC");
        ATCref = Allele.create("ATC", true);
    }
|
||||
|
||||
    // Fresh builders before every test method: VariantContextBuilder is
    // mutable, so sharing instances across tests would leak state.
    @BeforeMethod
    public void beforeTest() {
        basicBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T));
        snpBuilder = new VariantContextBuilder("test", snpLoc,snpLocStart, snpLocStop, Arrays.asList(Aref, T));
        insBuilder = new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATC));
    }
|
||||
|
||||
    /**
     * Verifies that {@code VariantContext.getType()} classifies each allele
     * combination as NO_VARIATION, SNP, MNP, INDEL, MIXED, or SYMBOLIC.
     * The builder's stop position is adjusted to match the reference
     * allele's length in each case.
     */
    @Test
    public void testDetermineTypes() {
        // NOTE: local C shadows the class field C of the same name/value
        Allele ACref = Allele.create("AC", true);
        Allele AC = Allele.create("AC");
        Allele AT = Allele.create("AT");
        Allele C = Allele.create("C");
        Allele CAT = Allele.create("CAT");
        Allele TAref = Allele.create("TA", true);
        Allele TA = Allele.create("TA");
        Allele TC = Allele.create("TC");
        Allele symbolic = Allele.create("<FOO>");

        // test REF: reference-only allele list is NO_VARIATION
        List<Allele> alleles = Arrays.asList(Tref);
        VariantContext vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION);

        // test SNPs
        alleles = Arrays.asList(Tref, A);
        vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);

        alleles = Arrays.asList(Tref, A, C);
        vc = snpBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);

        // test MNPs: all alleles same length > 1
        alleles = Arrays.asList(ACref, TA);
        vc = snpBuilder.alleles(alleles).stop(snpLocStop+1).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MNP);

        alleles = Arrays.asList(ATCref, CAT, Allele.create("GGG"));
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MNP);

        // test INDELs: alleles of differing lengths
        alleles = Arrays.asList(Aref, ATC);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);

        alleles = Arrays.asList(ATCref, A);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);

        alleles = Arrays.asList(Tref, TA, TC);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);

        alleles = Arrays.asList(ATCref, A, AC);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);

        alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC"));
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+2).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);

        // test MIXED: combinations of SNP-like and indel-like alternates
        alleles = Arrays.asList(TAref, T, TC);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);

        alleles = Arrays.asList(TAref, T, AC);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);

        alleles = Arrays.asList(ACref, ATC, AT);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop+1).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);

        // a symbolic allele mixed with a concrete alternate is MIXED, not SYMBOLIC
        alleles = Arrays.asList(Aref, T, symbolic);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED);

        // test SYMBOLIC
        alleles = Arrays.asList(Tref, symbolic);
        vc = basicBuilder.alleles(alleles).stop(snpLocStop).make();
        Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC);
    }
|
||||
|
||||
@Test
|
||||
public void testMultipleSNPAlleleOrdering() {
|
||||
final List<Allele> allelesNaturalOrder = Arrays.asList(Aref, C, T);
|
||||
final List<Allele> allelesUnnaturalOrder = Arrays.asList(Aref, T, C);
|
||||
VariantContext naturalVC = snpBuilder.alleles(allelesNaturalOrder).make();
|
||||
VariantContext unnaturalVC = snpBuilder.alleles(allelesUnnaturalOrder).make();
|
||||
Assert.assertEquals(new ArrayList<Allele>(naturalVC.getAlleles()), allelesNaturalOrder);
|
||||
Assert.assertEquals(new ArrayList<Allele>(unnaturalVC.getAlleles()), allelesUnnaturalOrder);
|
||||
}
|
||||
|
||||
    /**
     * A biallelic A/T site must report SNP type, correct coordinates,
     * correct reference/alternate partitioning, and no genotypes.
     */
    @Test
    public void testCreatingSNPVariantContext() {

        List<Allele> alleles = Arrays.asList(Aref, T);
        VariantContext vc = snpBuilder.alleles(alleles).make();

        // location accessors
        Assert.assertEquals(vc.getChr(), snpLoc);
        Assert.assertEquals(vc.getStart(), snpLocStart);
        Assert.assertEquals(vc.getEnd(), snpLocStop);

        // type predicates
        Assert.assertEquals(vc.getType(), VariantContext.Type.SNP);
        Assert.assertTrue(vc.isSNP());
        Assert.assertFalse(vc.isIndel());
        Assert.assertFalse(vc.isSimpleInsertion());
        Assert.assertFalse(vc.isSimpleDeletion());
        Assert.assertFalse(vc.isMixed());
        Assert.assertTrue(vc.isBiallelic());
        Assert.assertEquals(vc.getNAlleles(), 2);

        // allele partitioning: 1 ref + 1 alt
        Assert.assertEquals(vc.getReference(), Aref);
        Assert.assertEquals(vc.getAlleles().size(), 2);
        Assert.assertEquals(vc.getAlternateAlleles().size(), 1);
        Assert.assertEquals(vc.getAlternateAllele(0), T);

        // sites-only context: no genotypes, no samples
        Assert.assertFalse(vc.hasGenotypes());

        Assert.assertEquals(vc.getSampleNames().size(), 0);
    }
|
||||
|
||||
@Test
|
||||
public void testCreatingRefVariantContext() {
|
||||
List<Allele> alleles = Arrays.asList(Aref);
|
||||
VariantContext vc = snpBuilder.alleles(alleles).make();
|
||||
|
||||
Assert.assertEquals(vc.getChr(), snpLoc);
|
||||
Assert.assertEquals(vc.getStart(), snpLocStart);
|
||||
Assert.assertEquals(vc.getEnd(), snpLocStop);
|
||||
Assert.assertEquals(VariantContext.Type.NO_VARIATION, vc.getType());
|
||||
Assert.assertFalse(vc.isSNP());
|
||||
Assert.assertFalse(vc.isIndel());
|
||||
Assert.assertFalse(vc.isSimpleInsertion());
|
||||
Assert.assertFalse(vc.isSimpleDeletion());
|
||||
Assert.assertFalse(vc.isMixed());
|
||||
Assert.assertFalse(vc.isBiallelic());
|
||||
Assert.assertEquals(vc.getNAlleles(), 1);
|
||||
|
||||
Assert.assertEquals(vc.getReference(), Aref);
|
||||
Assert.assertEquals(vc.getAlleles().size(), 1);
|
||||
Assert.assertEquals(vc.getAlternateAlleles().size(), 0);
|
||||
//Assert.assertEquals(vc.getAlternateAllele(0), T);
|
||||
|
||||
Assert.assertFalse(vc.hasGenotypes());
|
||||
Assert.assertEquals(vc.getSampleNames().size(), 0);
|
||||
}
|
||||
|
||||
    /**
     * An ATC-ref / A-alt context spanning [delLocStart, delLocStop] must be
     * an INDEL and specifically a simple deletion.
     */
    @Test
    public void testCreatingDeletionVariantContext() {
        List<Allele> alleles = Arrays.asList(ATCref, del);
        VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make();

        // location accessors: stop spans the full 3bp reference allele
        Assert.assertEquals(vc.getChr(), delLoc);
        Assert.assertEquals(vc.getStart(), delLocStart);
        Assert.assertEquals(vc.getEnd(), delLocStop);

        // type predicates
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
        Assert.assertFalse(vc.isSNP());
        Assert.assertTrue(vc.isIndel());
        Assert.assertFalse(vc.isSimpleInsertion());
        Assert.assertTrue(vc.isSimpleDeletion());
        Assert.assertFalse(vc.isMixed());
        Assert.assertTrue(vc.isBiallelic());
        Assert.assertEquals(vc.getNAlleles(), 2);

        // allele partitioning
        Assert.assertEquals(vc.getReference(), ATCref);
        Assert.assertEquals(vc.getAlleles().size(), 2);
        Assert.assertEquals(vc.getAlternateAlleles().size(), 1);
        Assert.assertEquals(vc.getAlternateAllele(0), del);

        // sites-only context
        Assert.assertFalse(vc.hasGenotypes());

        Assert.assertEquals(vc.getSampleNames().size(), 0);
    }
|
||||
|
||||
@Test
|
||||
public void testMatchingAlleles() {
|
||||
List<Allele> alleles = Arrays.asList(ATCref, del);
|
||||
VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make();
|
||||
VariantContext vc2 = new VariantContextBuilder("test2", delLoc, delLocStart+12, delLocStop+12, alleles).make();
|
||||
|
||||
Assert.assertTrue(vc.hasSameAllelesAs(vc2));
|
||||
Assert.assertTrue(vc.hasSameAlternateAllelesAs(vc2));
|
||||
}
|
||||
|
||||
    /**
     * A 1bp-ref / ATC-alt context at a single position must be an INDEL and
     * specifically a simple insertion.
     */
    @Test
    public void testCreatingInsertionVariantContext() {
        List<Allele> alleles = Arrays.asList(delRef, ATC);
        VariantContext vc = insBuilder.alleles(alleles).make();

        // location accessors: insertion occupies a single reference base
        Assert.assertEquals(vc.getChr(), insLoc);
        Assert.assertEquals(vc.getStart(), insLocStart);
        Assert.assertEquals(vc.getEnd(), insLocStop);

        // type predicates
        Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL);
        Assert.assertFalse(vc.isSNP());
        Assert.assertTrue(vc.isIndel());
        Assert.assertTrue(vc.isSimpleInsertion());
        Assert.assertFalse(vc.isSimpleDeletion());
        Assert.assertFalse(vc.isMixed());
        Assert.assertTrue(vc.isBiallelic());
        Assert.assertEquals(vc.getNAlleles(), 2);

        // allele partitioning
        Assert.assertEquals(vc.getReference(), delRef);
        Assert.assertEquals(vc.getAlleles().size(), 2);
        Assert.assertEquals(vc.getAlternateAlleles().size(), 1);
        Assert.assertEquals(vc.getAlternateAllele(0), ATC);
        Assert.assertFalse(vc.hasGenotypes());

        Assert.assertEquals(vc.getSampleNames().size(), 0);
    }
|
||||
|
||||
@Test
|
||||
public void testCreatingPartiallyCalledGenotype() {
|
||||
List<Allele> alleles = Arrays.asList(Aref, C);
|
||||
Genotype g = GenotypeBuilder.create("foo", Arrays.asList(C, Allele.NO_CALL));
|
||||
VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g).make();
|
||||
|
||||
Assert.assertTrue(vc.isSNP());
|
||||
Assert.assertEquals(vc.getNAlleles(), 2);
|
||||
Assert.assertTrue(vc.hasGenotypes());
|
||||
Assert.assertFalse(vc.isMonomorphicInSamples());
|
||||
Assert.assertTrue(vc.isPolymorphicInSamples());
|
||||
Assert.assertEquals(vc.getGenotype("foo"), g);
|
||||
Assert.assertEquals(vc.getCalledChrCount(), 1); // we only have 1 called chromosomes, we exclude the NO_CALL one isn't called
|
||||
Assert.assertEquals(vc.getCalledChrCount(Aref), 0);
|
||||
Assert.assertEquals(vc.getCalledChrCount(C), 1);
|
||||
Assert.assertFalse(vc.getGenotype("foo").isHet());
|
||||
Assert.assertFalse(vc.getGenotype("foo").isHom());
|
||||
Assert.assertFalse(vc.getGenotype("foo").isNoCall());
|
||||
Assert.assertFalse(vc.getGenotype("foo").isHom());
|
||||
Assert.assertTrue(vc.getGenotype("foo").isMixed());
|
||||
Assert.assertEquals(vc.getGenotype("foo").getType(), GenotypeType.MIXED);
|
||||
}
|
||||
|
||||
    // ------------------------------------------------------------------
    // Invalid allele lists: each of the following must throw when make()
    // validates the context.
    // ------------------------------------------------------------------

    // two reference alleles is illegal
    @Test (expectedExceptions = Exception.class)
    public void testBadConstructorArgs1() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).make();
    }

    // the same base string as both ref and alt is illegal
    @Test (expectedExceptions = Exception.class)
    public void testBadConstructorArgs2() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)).make();
    }

    // an allele list without any reference allele is illegal
    @Test (expectedExceptions = Exception.class)
    public void testBadConstructorArgs3() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)).make();
    }

    // an empty allele list is illegal
    @Test (expectedExceptions = Throwable.class)
    public void testBadConstructorArgs4() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Collections.<Allele>emptyList()).make();
    }

    // duplicate alternate alleles are illegal
    @Test (expectedExceptions = Exception.class)
    public void testBadConstructorArgsDuplicateAlleles1() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, T, T)).make();
    }

    // ref and alt with identical bases are illegal
    @Test (expectedExceptions = Exception.class)
    public void testBadConstructorArgsDuplicateAlleles2() {
        new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(Aref, A)).make();
    }
|
||||
|
||||
    // 1bp alleles over the 3bp [delLocStart, delLocStop] span: the stop
    // position is inconsistent with the reference allele length
    @Test (expectedExceptions = Throwable.class)
    public void testBadLoc1() {
        List<Allele> alleles = Arrays.asList(Aref, T, del);
        new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).make();
    }

    // a null ID must be rejected
    @Test (expectedExceptions = Throwable.class)
    public void testBadID1() {
        new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id(null).make();
    }

    // an empty-string ID must be rejected
    @Test (expectedExceptions = Exception.class)
    public void testBadID2() {
        new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, Arrays.asList(Aref, T)).id("").make();
    }
|
||||
|
||||
@Test (expectedExceptions = Throwable.class)
|
||||
public void testBadPError() {
|
||||
new VariantContextBuilder("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref)).log10PError(0.5).make();
|
||||
}
|
||||
|
||||
    /**
     * Genotype lookup by sample name on a fully-called biallelic SNP:
     * lookups are case-sensitive, unknown names report absent, and
     * chromosome counts sum over all called genotypes.
     */
    @Test
    public void testAccessingSimpleSNPGenotypes() {
        List<Allele> alleles = Arrays.asList(Aref, T);

        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));

        VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles)
                .genotypes(g1, g2, g3).make();

        Assert.assertTrue(vc.hasGenotypes());
        Assert.assertFalse(vc.isMonomorphicInSamples());
        Assert.assertTrue(vc.isPolymorphicInSamples());
        Assert.assertEquals(vc.getSampleNames().size(), 3);

        // lookup via the GenotypesContext and via the convenience accessor
        Assert.assertEquals(vc.getGenotypes().size(), 3);
        Assert.assertEquals(vc.getGenotypes().get("AA"), g1);
        Assert.assertEquals(vc.getGenotype("AA"), g1);
        Assert.assertEquals(vc.getGenotypes().get("AT"), g2);
        Assert.assertEquals(vc.getGenotype("AT"), g2);
        Assert.assertEquals(vc.getGenotypes().get("TT"), g3);
        Assert.assertEquals(vc.getGenotype("TT"), g3);

        // membership checks are exact-match and case-sensitive
        Assert.assertTrue(vc.hasGenotype("AA"));
        Assert.assertTrue(vc.hasGenotype("AT"));
        Assert.assertTrue(vc.hasGenotype("TT"));
        Assert.assertFalse(vc.hasGenotype("foo"));
        Assert.assertFalse(vc.hasGenotype("TTT"));
        Assert.assertFalse(vc.hasGenotype("at"));
        Assert.assertFalse(vc.hasGenotype("tt"));

        // 3 diploid samples => 6 called chromosomes, 3 A and 3 T
        Assert.assertEquals(vc.getCalledChrCount(), 6);
        Assert.assertEquals(vc.getCalledChrCount(Aref), 3);
        Assert.assertEquals(vc.getCalledChrCount(T), 3);
    }
|
||||
|
||||
    /**
     * Chromosome counting over a multi-allelic site that includes a fully
     * uncalled genotype: NO_CALL chromosomes are excluded from
     * getCalledChrCount() but countable when asked for explicitly.
     */
    @Test
    public void testAccessingCompleteGenotypes() {
        List<Allele> alleles = Arrays.asList(Aref, T, ATC);

        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
        Genotype g4 = GenotypeBuilder.create("Td", Arrays.asList(T, ATC));
        Genotype g5 = GenotypeBuilder.create("dd", Arrays.asList(ATC, ATC));
        Genotype g6 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));

        VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles)
                .genotypes(g1, g2, g3, g4, g5, g6).make();

        Assert.assertTrue(vc.hasGenotypes());
        Assert.assertFalse(vc.isMonomorphicInSamples());
        Assert.assertTrue(vc.isPolymorphicInSamples());
        Assert.assertEquals(vc.getGenotypes().size(), 6);

        // subsetting by sample-name collection
        Assert.assertEquals(3, vc.getGenotypes(Arrays.asList("AA", "Td", "dd")).size());

        // 6 diploid samples = 12 chromosomes; the ".." sample's 2 NO_CALLs
        // are excluded from the called total (12 - 2 = 10)
        Assert.assertEquals(10, vc.getCalledChrCount());
        Assert.assertEquals(3, vc.getCalledChrCount(Aref));
        Assert.assertEquals(4, vc.getCalledChrCount(T));
        Assert.assertEquals(3, vc.getCalledChrCount(ATC));
        Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL));
    }
|
||||
|
||||
    /**
     * A site whose called genotypes are all hom-ref is monomorphic in
     * samples, regardless of whether an alternate allele (T) is declared
     * in the context's allele list.
     */
    @Test
    public void testAccessingRefGenotypes() {
        // alleles1/alleles3 declare an unused alt; alleles2 is ref-only —
        // the assertions below must hold identically for all three
        List<Allele> alleles1 = Arrays.asList(Aref, T);
        List<Allele> alleles2 = Arrays.asList(Aref);
        List<Allele> alleles3 = Arrays.asList(Aref, T);
        for ( List<Allele> alleles : Arrays.asList(alleles1, alleles2, alleles3)) {
            Genotype g1 = GenotypeBuilder.create("AA1", Arrays.asList(Aref, Aref));
            Genotype g2 = GenotypeBuilder.create("AA2", Arrays.asList(Aref, Aref));
            Genotype g3 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
            VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles)
                    .genotypes(g1, g2, g3).make();

            Assert.assertTrue(vc.hasGenotypes());
            Assert.assertTrue(vc.isMonomorphicInSamples());
            Assert.assertFalse(vc.isPolymorphicInSamples());
            Assert.assertEquals(vc.getGenotypes().size(), 3);

            // 2 called diploid samples => 4 called chromosomes, all Aref
            Assert.assertEquals(4, vc.getCalledChrCount());
            Assert.assertEquals(4, vc.getCalledChrCount(Aref));
            Assert.assertEquals(0, vc.getCalledChrCount(T));
            Assert.assertEquals(2, vc.getCalledChrCount(Allele.NO_CALL));
        }
    }
|
||||
|
||||
    /**
     * Filter state transitions: a fresh context has no filters applied
     * (getFiltersMaybeNull() == null), a single filter string marks it
     * filtered, and a filter set replaces — not appends to — the previous
     * filters.
     */
    @Test
    public void testFilters() {
        List<Allele> alleles = Arrays.asList(Aref, T);
        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));

        VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1, g2).make();

        // unfiltered AND filters never applied: MaybeNull distinguishes the two
        Assert.assertTrue(vc.isNotFiltered());
        Assert.assertFalse(vc.isFiltered());
        Assert.assertEquals(0, vc.getFilters().size());
        Assert.assertFalse(vc.filtersWereApplied());
        Assert.assertNull(vc.getFiltersMaybeNull());

        // a single failing filter
        vc = new VariantContextBuilder(vc).filters("BAD_SNP_BAD!").make();

        Assert.assertFalse(vc.isNotFiltered());
        Assert.assertTrue(vc.isFiltered());
        Assert.assertEquals(1, vc.getFilters().size());
        Assert.assertTrue(vc.filtersWereApplied());
        Assert.assertNotNull(vc.getFiltersMaybeNull());

        // a filter set replaces the earlier single filter (3 total, not 4)
        Set<String> filters = new HashSet<String>(Arrays.asList("BAD_SNP_BAD!", "REALLY_BAD_SNP", "CHRIST_THIS_IS_TERRIBLE"));
        vc = new VariantContextBuilder(vc).filters(filters).make();

        Assert.assertFalse(vc.isNotFiltered());
        Assert.assertTrue(vc.isFiltered());
        Assert.assertEquals(3, vc.getFilters().size());
        Assert.assertTrue(vc.filtersWereApplied());
        Assert.assertNotNull(vc.getFiltersMaybeNull());
    }
|
||||
|
||||
    /**
     * Each of the five genotype-class counters sees exactly one of the five
     * sample genotypes: one het (AT), one hom-ref (AA), one hom-var (TT),
     * one mixed (A.), one no-call (..).
     */
    @Test
    public void testGetGenotypeCounts() {
        List<Allele> alleles = Arrays.asList(Aref, T);
        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
        Genotype g4 = GenotypeBuilder.create("A.", Arrays.asList(Aref, Allele.NO_CALL));
        Genotype g5 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));

        // we need to create a new VariantContext each time
        // (presumably because the counts are computed lazily and cached on
        // first access — TODO confirm against VariantContext internals)
        VariantContext vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
        Assert.assertEquals(1, vc.getHetCount());
        vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
        Assert.assertEquals(1, vc.getHomRefCount());
        vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
        Assert.assertEquals(1, vc.getHomVarCount());
        vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
        Assert.assertEquals(1, vc.getMixedCount());
        vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make();
        Assert.assertEquals(1, vc.getNoCallCount());
    }
|
||||
|
||||
    /**
     * subContextFromSamples() with rederiveAllelesFromGenotypes=true: a
     * subset containing only hom-ref and/or no-call samples collapses to a
     * monomorphic, non-variant context, while subsets retaining a T carrier
     * stay polymorphic biallelic SNPs.
     */
    @Test
    public void testVCFfromGenotypes() {
        List<Allele> alleles = Arrays.asList(Aref, T);
        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
        Genotype g4 = GenotypeBuilder.create("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
        VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4).make();

        // vcNM = subset down to the samples of genotypes N and M
        VariantContext vc12 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName(), g2.getSampleName())), true);
        VariantContext vc1 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName())), true);
        VariantContext vc23 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g2.getSampleName(), g3.getSampleName())), true);
        VariantContext vc4 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g4.getSampleName())), true);
        VariantContext vc14 = vc.subContextFromSamples(new HashSet<String>(Arrays.asList(g1.getSampleName(), g4.getSampleName())), true);

        // polymorphic only if some subset sample carries T
        Assert.assertTrue(vc12.isPolymorphicInSamples());
        Assert.assertTrue(vc23.isPolymorphicInSamples());
        Assert.assertTrue(vc1.isMonomorphicInSamples());
        Assert.assertTrue(vc4.isMonomorphicInSamples());
        Assert.assertTrue(vc14.isMonomorphicInSamples());

        Assert.assertTrue(vc12.isSNP());
        Assert.assertTrue(vc12.isVariant());
        Assert.assertTrue(vc12.isBiallelic());

        // hom-ref-only subset: alleles rederived, no variation remains
        Assert.assertFalse(vc1.isSNP());
        Assert.assertFalse(vc1.isVariant());
        Assert.assertFalse(vc1.isBiallelic());

        Assert.assertTrue(vc23.isSNP());
        Assert.assertTrue(vc23.isVariant());
        Assert.assertTrue(vc23.isBiallelic());

        // no-call-only subset is likewise non-variant
        Assert.assertFalse(vc4.isSNP());
        Assert.assertFalse(vc4.isVariant());
        Assert.assertFalse(vc4.isBiallelic());

        Assert.assertFalse(vc14.isSNP());
        Assert.assertFalse(vc14.isVariant());
        Assert.assertFalse(vc14.isBiallelic());

        // chromosome counts reflect only the retained genotypes
        Assert.assertEquals(3, vc12.getCalledChrCount(Aref));
        Assert.assertEquals(1, vc23.getCalledChrCount(Aref));
        Assert.assertEquals(2, vc1.getCalledChrCount(Aref));
        Assert.assertEquals(0, vc4.getCalledChrCount(Aref));
        Assert.assertEquals(2, vc14.getCalledChrCount(Aref));
    }
|
||||
|
||||
public void testGetGenotypeMethods() {
|
||||
Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
|
||||
Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
|
||||
Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
|
||||
GenotypesContext gc = GenotypesContext.create(g1, g2, g3);
|
||||
VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
|
||||
|
||||
Assert.assertEquals(vc.getGenotype("AA"), g1);
|
||||
Assert.assertEquals(vc.getGenotype("AT"), g2);
|
||||
Assert.assertEquals(vc.getGenotype("TT"), g3);
|
||||
Assert.assertEquals(vc.getGenotype("CC"), null);
|
||||
|
||||
Assert.assertEquals(vc.getGenotypes(), gc);
|
||||
Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT")), Arrays.asList(g1, g2));
|
||||
Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "TT")), Arrays.asList(g1, g3));
|
||||
Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "TT")), Arrays.asList(g1, g2, g3));
|
||||
Assert.assertEquals(vc.getGenotypes(Arrays.asList("AA", "AT", "CC")), Arrays.asList(g1, g2));
|
||||
|
||||
Assert.assertEquals(vc.getGenotype(0), g1);
|
||||
Assert.assertEquals(vc.getGenotype(1), g2);
|
||||
Assert.assertEquals(vc.getGenotype(2), g3);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test allele merging
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class GetAllelesTest extends TestDataProvider {
|
||||
List<Allele> alleles;
|
||||
|
||||
private GetAllelesTest(String name, Allele... arg) {
|
||||
super(GetAllelesTest.class, name);
|
||||
this.alleles = Arrays.asList(arg);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), alleles);
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Data provider for testMergeAlleles: one case per allele ordering,
     * always with Aref first as the reference.
     */
    @DataProvider(name = "getAlleles")
    public Object[][] mergeAllelesData() {
        // each constructed instance presumably registers itself via the
        // TestDataProvider super constructor, and getTests() returns the
        // accumulated cases — TODO confirm against TestDataProvider
        new GetAllelesTest("A*", Aref);
        new GetAllelesTest("A*/C", Aref, C);
        new GetAllelesTest("A*/C/T", Aref, C, T);
        new GetAllelesTest("A*/T/C", Aref, T, C);
        new GetAllelesTest("A*/C/T/ATC", Aref, C, T, ATC);
        new GetAllelesTest("A*/T/C/ATC", Aref, T, C, ATC);
        new GetAllelesTest("A*/ATC/T/C", Aref, ATC, T, C);

        return GetAllelesTest.getTests(GetAllelesTest.class);
    }
|
||||
|
||||
    /**
     * For every allele ordering from the "getAlleles" provider: the context
     * preserves input order, partitions ref vs. alt correctly, and supports
     * lookup by allele, base string, and raw bases.
     */
    @Test(dataProvider = "getAlleles")
    public void testMergeAlleles(GetAllelesTest cfg) {
        // by provider convention, index 0 is the reference allele
        final List<Allele> altAlleles = cfg.alleles.subList(1, cfg.alleles.size());
        final VariantContext vc = new VariantContextBuilder("test", snpLoc, snpLocStart, snpLocStop, cfg.alleles).make();

        Assert.assertEquals(vc.getAlleles(), cfg.alleles, "VC alleles not the same as input alleles");
        Assert.assertEquals(vc.getNAlleles(), cfg.alleles.size(), "VC getNAlleles not the same as input alleles size");
        Assert.assertEquals(vc.getAlternateAlleles(), altAlleles, "VC alt alleles not the same as input alt alleles");


        for ( int i = 0; i < cfg.alleles.size(); i++ ) {
            final Allele inputAllele = cfg.alleles.get(i);

            Assert.assertTrue(vc.hasAllele(inputAllele));
            if ( inputAllele.isReference() ) {
                // same bases with the ref flag flipped: found only when the
                // lookup ignores the ref/alt status
                final Allele nonRefVersion = Allele.create(inputAllele.getBases(), false);
                Assert.assertTrue(vc.hasAllele(nonRefVersion, true));
                Assert.assertFalse(vc.hasAllele(nonRefVersion, false));
            }

            // lookup by base string and by raw byte bases must agree
            Assert.assertEquals(inputAllele, vc.getAllele(inputAllele.getBaseString()));
            Assert.assertEquals(inputAllele, vc.getAllele(inputAllele.getBases()));

            if ( i > 0 ) { // it's an alt allele
                Assert.assertEquals(inputAllele, vc.getAlternateAllele(i-1));
            }
        }

        final Allele missingAllele = Allele.create("AACCGGTT"); // does not exist
        Assert.assertNull(vc.getAllele(missingAllele.getBases()));
        Assert.assertFalse(vc.hasAllele(missingAllele));
        Assert.assertFalse(vc.hasAllele(missingAllele, true));
    }
|
||||
|
||||
private class SitesAndGenotypesVC extends TestDataProvider {
|
||||
VariantContext vc, copy;
|
||||
|
||||
private SitesAndGenotypesVC(String name, VariantContext original) {
|
||||
super(SitesAndGenotypesVC.class, name);
|
||||
this.vc = original;
|
||||
this.copy = new VariantContextBuilder(original).make();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), vc);
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Data provider for runModifyVCTests: one sites-only context and one
     * context carrying three genotypes, both over the same A/T SNP site.
     */
    @DataProvider(name = "SitesAndGenotypesVC")
    public Object[][] MakeSitesAndGenotypesVCs() {
        Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));

        VariantContext sites = new VariantContextBuilder("sites", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).make();
        // the genotypes-bearing VC is derived from the sites VC with a new source
        VariantContext genotypes = new VariantContextBuilder(sites).source("genotypes").genotypes(g1, g2, g3).make();

        new SitesAndGenotypesVC("sites", sites);
        new SitesAndGenotypesVC("genotypes", genotypes);

        return SitesAndGenotypesVC.getTests(SitesAndGenotypesVC.class);
    }
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test modifying routines
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
    /**
     * Builder-based modification must produce a new context with the
     * requested change while leaving the source context untouched (verified
     * against the pristine copy held by the provider).
     */
    @Test(dataProvider = "SitesAndGenotypesVC")
    public void runModifyVCTests(SitesAndGenotypesVC cfg) {
        // relocate
        VariantContext modified = new VariantContextBuilder(cfg.vc).loc("chr2", 123, 123).make();
        Assert.assertEquals(modified.getChr(), "chr2");
        Assert.assertEquals(modified.getStart(), 123);
        Assert.assertEquals(modified.getEnd(), 123);

        // re-ID
        modified = new VariantContextBuilder(cfg.vc).id("newID").make();
        Assert.assertEquals(modified.getID(), "newID");

        // replace filters
        Set<String> newFilters = Collections.singleton("newFilter");
        modified = new VariantContextBuilder(cfg.vc).filters(newFilters).make();
        Assert.assertEquals(modified.getFilters(), newFilters);

        // test the behavior when the builder's attribute object is null
        modified = new VariantContextBuilder(modified).attributes(null).make();
        Assert.assertTrue(modified.getAttributes().isEmpty());
        modified = new VariantContextBuilder(modified).attributes(null).rmAttribute("AC").make();
        Assert.assertTrue(modified.getAttributes().isEmpty());
        modified = new VariantContextBuilder(modified).attributes(null).attribute("AC", 1).make();
        Assert.assertEquals(modified.getAttribute("AC"), 1);

        // test the behavior when the builder's attribute object is not initialized
        modified = new VariantContextBuilder(modified.getSource(), modified.getChr(), modified.getStart(), modified.getEnd(), modified.getAlleles()).attribute("AC", 1).make();

        // test normal attribute modification
        modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make();
        Assert.assertEquals(modified.getAttribute("AC"), 1);
        modified = new VariantContextBuilder(modified).attribute("AC", 2).make();
        Assert.assertEquals(modified.getAttribute("AC"), 2);

        // replace and clear genotypes
        Genotype g1 = GenotypeBuilder.create("AA2", Arrays.asList(Aref, Aref));
        Genotype g2 = GenotypeBuilder.create("AT2", Arrays.asList(Aref, T));
        Genotype g3 = GenotypeBuilder.create("TT2", Arrays.asList(T, T));
        GenotypesContext gc = GenotypesContext.create(g1,g2,g3);
        modified = new VariantContextBuilder(cfg.vc).genotypes(gc).make();
        Assert.assertEquals(modified.getGenotypes(), gc);
        modified = new VariantContextBuilder(cfg.vc).noGenotypes().make();
        Assert.assertTrue(modified.getGenotypes().isEmpty());

        // test that original hasn't changed
        Assert.assertEquals(cfg.vc.getChr(), cfg.copy.getChr());
        Assert.assertEquals(cfg.vc.getStart(), cfg.copy.getStart());
        Assert.assertEquals(cfg.vc.getEnd(), cfg.copy.getEnd());
        Assert.assertEquals(cfg.vc.getAlleles(), cfg.copy.getAlleles());
        Assert.assertEquals(cfg.vc.getAttributes(), cfg.copy.getAttributes());
        Assert.assertEquals(cfg.vc.getID(), cfg.copy.getID());
        Assert.assertEquals(cfg.vc.getGenotypes(), cfg.copy.getGenotypes());
        Assert.assertEquals(cfg.vc.getLog10PError(), cfg.copy.getLog10PError());
        Assert.assertEquals(cfg.vc.getFilters(), cfg.copy.getFilters());
    }
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test subcontext
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
private class SubContextTest extends TestDataProvider {
|
||||
Set<String> samples;
|
||||
boolean updateAlleles;
|
||||
|
||||
private SubContextTest(Collection<String> samples, boolean updateAlleles) {
|
||||
super(SubContextTest.class);
|
||||
this.samples = new HashSet<String>(samples);
|
||||
this.updateAlleles = updateAlleles;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s samples=%s updateAlleles=%b", super.toString(), samples, updateAlleles);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "SubContextTest")
|
||||
public Object[][] MakeSubContextTest() {
|
||||
for ( boolean updateAlleles : Arrays.asList(true, false)) {
|
||||
new SubContextTest(Collections.<String>emptySet(), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("MISSING"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("AA"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("AT"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("TT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles);
|
||||
}
|
||||
|
||||
return SubContextTest.getTests(SubContextTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SubContextTest")
|
||||
public void runSubContextTest(SubContextTest cfg) {
|
||||
Genotype g1 = GenotypeBuilder.create("AA", Arrays.asList(Aref, Aref));
|
||||
Genotype g2 = GenotypeBuilder.create("AT", Arrays.asList(Aref, T));
|
||||
Genotype g3 = GenotypeBuilder.create("TT", Arrays.asList(T, T));
|
||||
|
||||
GenotypesContext gc = GenotypesContext.create(g1, g2, g3);
|
||||
VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
|
||||
VariantContext sub = vc.subContextFromSamples(cfg.samples, cfg.updateAlleles);
|
||||
|
||||
// unchanged attributes should be the same
|
||||
Assert.assertEquals(sub.getChr(), vc.getChr());
|
||||
Assert.assertEquals(sub.getStart(), vc.getStart());
|
||||
Assert.assertEquals(sub.getEnd(), vc.getEnd());
|
||||
Assert.assertEquals(sub.getLog10PError(), vc.getLog10PError());
|
||||
Assert.assertEquals(sub.getFilters(), vc.getFilters());
|
||||
Assert.assertEquals(sub.getID(), vc.getID());
|
||||
Assert.assertEquals(sub.getAttributes(), vc.getAttributes());
|
||||
|
||||
Set<Genotype> expectedGenotypes = new HashSet<Genotype>();
|
||||
if ( cfg.samples.contains(g1.getSampleName()) ) expectedGenotypes.add(g1);
|
||||
if ( cfg.samples.contains(g2.getSampleName()) ) expectedGenotypes.add(g2);
|
||||
if ( cfg.samples.contains(g3.getSampleName()) ) expectedGenotypes.add(g3);
|
||||
GenotypesContext expectedGC = GenotypesContext.copy(expectedGenotypes);
|
||||
|
||||
// these values depend on the results of sub
|
||||
if ( cfg.updateAlleles ) {
|
||||
// do the work to see what alleles should be here, and which not
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
for ( final Genotype g : expectedGC ) alleles.addAll(g.getAlleles());
|
||||
if ( ! alleles.contains(Aref) ) alleles.add(Aref); // always have the reference
|
||||
Assert.assertEquals(new HashSet<Allele>(sub.getAlleles()), alleles);
|
||||
} else {
|
||||
// not updating alleles -- should be the same
|
||||
Assert.assertEquals(sub.getAlleles(), vc.getAlleles());
|
||||
}
|
||||
|
||||
// same sample names => success
|
||||
Assert.assertEquals(sub.getGenotypes().getSampleNames(), expectedGC.getSampleNames());
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test sample name functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
private class SampleNamesTest extends TestDataProvider {
|
||||
List<String> sampleNames;
|
||||
List<String> sampleNamesInOrder;
|
||||
|
||||
private SampleNamesTest(List<String> sampleNames, List<String> sampleNamesInOrder) {
|
||||
super(SampleNamesTest.class);
|
||||
this.sampleNamesInOrder = sampleNamesInOrder;
|
||||
this.sampleNames = sampleNames;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s samples=%s order=%s", super.toString(), sampleNames, sampleNamesInOrder);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "SampleNamesTest")
|
||||
public Object[][] MakeSampleNamesTest() {
|
||||
new SampleNamesTest(Arrays.asList("1"), Arrays.asList("1"));
|
||||
new SampleNamesTest(Arrays.asList("2", "1"), Arrays.asList("1", "2"));
|
||||
new SampleNamesTest(Arrays.asList("1", "2"), Arrays.asList("1", "2"));
|
||||
new SampleNamesTest(Arrays.asList("1", "2", "3"), Arrays.asList("1", "2", "3"));
|
||||
new SampleNamesTest(Arrays.asList("2", "1", "3"), Arrays.asList("1", "2", "3"));
|
||||
new SampleNamesTest(Arrays.asList("2", "3", "1"), Arrays.asList("1", "2", "3"));
|
||||
new SampleNamesTest(Arrays.asList("3", "1", "2"), Arrays.asList("1", "2", "3"));
|
||||
new SampleNamesTest(Arrays.asList("3", "2", "1"), Arrays.asList("1", "2", "3"));
|
||||
new SampleNamesTest(Arrays.asList("NA2", "NA1"), Arrays.asList("NA1", "NA2"));
|
||||
return SampleNamesTest.getTests(SampleNamesTest.class);
|
||||
}
|
||||
|
||||
private final static void assertGenotypesAreInOrder(Iterable<Genotype> gIt, List<String> names) {
|
||||
int i = 0;
|
||||
for ( final Genotype g : gIt ) {
|
||||
Assert.assertEquals(g.getSampleName(), names.get(i), "Unexpected genotype ordering");
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "SampleNamesTest")
|
||||
public void runSampleNamesTest(SampleNamesTest cfg) {
|
||||
GenotypesContext gc = GenotypesContext.create(cfg.sampleNames.size());
|
||||
for ( final String name : cfg.sampleNames ) {
|
||||
gc.add(GenotypeBuilder.create(name, Arrays.asList(Aref, T)));
|
||||
}
|
||||
|
||||
VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
|
||||
|
||||
// same sample names => success
|
||||
Assert.assertEquals(vc.getSampleNames(), new HashSet<String>(cfg.sampleNames), "vc.getSampleNames() = " + vc.getSampleNames());
|
||||
Assert.assertEquals(vc.getSampleNamesOrderedByName(), cfg.sampleNamesInOrder, "vc.getSampleNamesOrderedByName() = " + vc.getSampleNamesOrderedByName());
|
||||
|
||||
assertGenotypesAreInOrder(vc.getGenotypesOrderedByName(), cfg.sampleNamesInOrder);
|
||||
assertGenotypesAreInOrder(vc.getGenotypesOrderedBy(cfg.sampleNames), cfg.sampleNames);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenotypeCounting() {
|
||||
Genotype noCall = GenotypeBuilder.create("nocall", Arrays.asList(Allele.NO_CALL));
|
||||
Genotype mixed = GenotypeBuilder.create("mixed", Arrays.asList(Aref, Allele.NO_CALL));
|
||||
Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(Aref, Aref));
|
||||
Genotype het = GenotypeBuilder.create("het", Arrays.asList(Aref, T));
|
||||
Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(T, T));
|
||||
|
||||
List<Genotype> allGenotypes = Arrays.asList(noCall, mixed, homRef, het, homVar);
|
||||
final int nCycles = allGenotypes.size() * 10;
|
||||
|
||||
for ( int i = 0; i < nCycles; i++ ) {
|
||||
int nNoCall = 0, nNoCallAlleles = 0, nA = 0, nT = 0, nMixed = 0, nHomRef = 0, nHet = 0, nHomVar = 0;
|
||||
int nSamples = 0;
|
||||
GenotypesContext gc = GenotypesContext.create();
|
||||
for ( int j = 0; j < i; j++ ) {
|
||||
nSamples++;
|
||||
Genotype g = allGenotypes.get(j % allGenotypes.size());
|
||||
final String name = String.format("%s_%d%d", g.getSampleName(), i, j);
|
||||
gc.add(GenotypeBuilder.create(name, g.getAlleles()));
|
||||
switch ( g.getType() ) {
|
||||
case NO_CALL: nNoCall++; nNoCallAlleles++; break;
|
||||
case HOM_REF: nA += 2; nHomRef++; break;
|
||||
case HET: nA++; nT++; nHet++; break;
|
||||
case HOM_VAR: nT += 2; nHomVar++; break;
|
||||
case MIXED: nA++; nNoCallAlleles++; nMixed++; break;
|
||||
default: throw new RuntimeException("Unexpected genotype type " + g.getType());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
VariantContext vc = new VariantContextBuilder("genotypes", snpLoc, snpLocStart, snpLocStop, Arrays.asList(Aref, T)).genotypes(gc).make();
|
||||
Assert.assertEquals(vc.getNSamples(), nSamples);
|
||||
if ( nSamples > 0 ) {
|
||||
Assert.assertEquals(vc.isPolymorphicInSamples(), nT > 0);
|
||||
Assert.assertEquals(vc.isMonomorphicInSamples(), nT == 0);
|
||||
}
|
||||
Assert.assertEquals(vc.getCalledChrCount(), nA + nT);
|
||||
|
||||
Assert.assertEquals(vc.getCalledChrCount(Allele.NO_CALL), nNoCallAlleles);
|
||||
Assert.assertEquals(vc.getCalledChrCount(Aref), nA);
|
||||
Assert.assertEquals(vc.getCalledChrCount(T), nT);
|
||||
|
||||
Assert.assertEquals(vc.getNoCallCount(), nNoCall);
|
||||
Assert.assertEquals(vc.getHomRefCount(), nHomRef);
|
||||
Assert.assertEquals(vc.getHetCount(), nHet);
|
||||
Assert.assertEquals(vc.getHomVarCount(), nHomVar);
|
||||
Assert.assertEquals(vc.getMixedCount(), nMixed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,130 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext;
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VariantJEXLContextUnitTest
|
||||
*
|
||||
* Test out parts of the VariantJEXLContext
|
||||
*/
|
||||
public class VariantJEXLContextUnitTest extends VariantBaseTest {
|
||||
|
||||
private static String expression = "QUAL > 500.0";
|
||||
private static VariantContextUtils.JexlVCMatchExp exp;
|
||||
|
||||
Allele A, Aref, T, Tref;
|
||||
|
||||
Allele ATC, ATCref;
|
||||
// A [ref] / T at 10
|
||||
|
||||
// - / ATC [ref] from 20-23
|
||||
|
||||
@BeforeClass
|
||||
public void beforeClass() {
|
||||
try {
|
||||
exp = new VariantContextUtils.JexlVCMatchExp("name", VariantContextUtils.engine.createExpression(expression));
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Unable to create expression" + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@BeforeMethod
|
||||
public void before() {
|
||||
A = Allele.create("A");
|
||||
Aref = Allele.create("A", true);
|
||||
T = Allele.create("T");
|
||||
Tref = Allele.create("T", true);
|
||||
|
||||
ATC = Allele.create("ATC");
|
||||
ATCref = Allele.create("ATC", true);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testGetValue() {
|
||||
Map<VariantContextUtils.JexlVCMatchExp, Boolean> map = getVarContext();
|
||||
|
||||
// make sure the context has a value
|
||||
Assert.assertTrue(!map.isEmpty());
|
||||
Assert.assertEquals(map.size(), 1);
|
||||
|
||||
// eval our known expression
|
||||
Assert.assertTrue(!map.get(exp));
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=UnsupportedOperationException.class)
|
||||
public void testContainsValue() {
|
||||
Map<VariantContextUtils.JexlVCMatchExp, Boolean> map = getVarContext();
|
||||
|
||||
map.containsValue(exp);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=UnsupportedOperationException.class)
|
||||
public void testRemove() {
|
||||
Map<VariantContextUtils.JexlVCMatchExp, Boolean> map = getVarContext();
|
||||
|
||||
map.remove(exp);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=UnsupportedOperationException.class)
|
||||
public void testEntrySet() {
|
||||
Map<VariantContextUtils.JexlVCMatchExp, Boolean> map = getVarContext();
|
||||
|
||||
map.entrySet();
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=UnsupportedOperationException.class)
|
||||
public void testClear() {
|
||||
Map<VariantContextUtils.JexlVCMatchExp, Boolean> map = getVarContext();
|
||||
|
||||
map.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* helper method
|
||||
* @return a VariantJEXLContext
|
||||
*/
|
||||
private JEXLMap getVarContext() {
|
||||
List<Allele> alleles = Arrays.asList(Aref, T);
|
||||
|
||||
VariantContext vc = new VariantContextBuilder("test", "chr1", 10, 10, alleles).make();
|
||||
return new JEXLMap(Arrays.asList(exp),vc);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,200 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderVersion;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFWriterUnitTest
|
||||
* <p/>
|
||||
* This class tests out the ability of the VCF writer to correctly write VCF files
|
||||
*/
|
||||
public class VCFWriterUnitTest extends VariantBaseTest {
|
||||
private Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
private Set<String> additionalColumns = new HashSet<String>();
|
||||
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
|
||||
private IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
File referenceFile = new File(hg19Reference);
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(referenceFile);
|
||||
}
|
||||
catch(FileNotFoundException ex) {
|
||||
throw new RuntimeException(referenceFile.getAbsolutePath(), ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** test, using the writer and reader, that we can output and input a VCF file without problems */
|
||||
@Test
|
||||
public void testBasicWriteAndRead() {
|
||||
VCFHeader header = createFakeHeader(metaData,additionalColumns);
|
||||
final EnumSet<Options> options = EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
|
||||
VariantContextWriter writer = VariantContextWriterFactory.create(fakeVCFFile, seq.getSequenceDictionary(), options);
|
||||
writer.writeHeader(header);
|
||||
writer.add(createVC(header));
|
||||
writer.add(createVC(header));
|
||||
writer.close();
|
||||
VCFCodec codec = new VCFCodec();
|
||||
VCFHeader headerFromFile = null;
|
||||
FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(fakeVCFFile.getAbsolutePath(), codec, false);
|
||||
headerFromFile = (VCFHeader)reader.getHeader();
|
||||
|
||||
int counter = 0;
|
||||
|
||||
// validate what we're reading in
|
||||
validateHeader(headerFromFile);
|
||||
|
||||
try {
|
||||
Iterator<VariantContext> it = reader.iterator();
|
||||
while(it.hasNext()) {
|
||||
VariantContext vc = it.next();
|
||||
counter++;
|
||||
}
|
||||
Assert.assertEquals(counter, 2);
|
||||
Tribble.indexFile(fakeVCFFile).delete();
|
||||
fakeVCFFile.delete();
|
||||
}
|
||||
catch (IOException e ) {
|
||||
throw new RuntimeException(e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* create a fake header of known quantity
|
||||
* @param metaData the header lines
|
||||
* @param additionalColumns the additional column names
|
||||
* @return a fake VCF header
|
||||
*/
|
||||
public static VCFHeader createFakeHeader(Set<VCFHeaderLine> metaData, Set<String> additionalColumns) {
|
||||
metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
|
||||
metaData.add(new VCFHeaderLine("two", "2"));
|
||||
additionalColumns.add("extra1");
|
||||
additionalColumns.add("extra2");
|
||||
return new VCFHeader(metaData, additionalColumns);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a fake VCF record
|
||||
* @param header the VCF header
|
||||
* @return a VCFRecord
|
||||
*/
|
||||
private VariantContext createVC(VCFHeader header) {
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Set<String> filters = null;
|
||||
Map<String, Object> attributes = new HashMap<String,Object>();
|
||||
GenotypesContext genotypes = GenotypesContext.create(header.getGenotypeSamples().size());
|
||||
|
||||
alleles.add(Allele.create("A",true));
|
||||
alleles.add(Allele.create("ACC",false));
|
||||
|
||||
attributes.put("DP","50");
|
||||
for (String name : header.getGenotypeSamples()) {
|
||||
Genotype gt = new GenotypeBuilder(name,alleles.subList(1,2)).GQ(0).attribute("BB", "1").phased(true).make();
|
||||
genotypes.add(gt);
|
||||
}
|
||||
return new VariantContextBuilder("RANDOM", "chr1", 1, 1, alleles)
|
||||
.genotypes(genotypes).attributes(attributes).make();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* validate a VCF header
|
||||
* @param header the header to validate
|
||||
*/
|
||||
public void validateHeader(VCFHeader header) {
|
||||
// check the fields
|
||||
int index = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
|
||||
Assert.assertEquals(VCFHeader.HEADER_FIELDS.values()[index], field);
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(header.getMetaDataInSortedOrder().size(), metaData.size());
|
||||
index = 0;
|
||||
for (String key : header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(index, additionalColumns.size());
|
||||
}
|
||||
|
||||
@DataProvider(name = "VCFWriterDoubleFormatTestData")
|
||||
public Object[][] makeVCFWriterDoubleFormatTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{1.0, "1.00"});
|
||||
tests.add(new Object[]{10.1, "10.10"});
|
||||
tests.add(new Object[]{10.01, "10.01"});
|
||||
tests.add(new Object[]{10.012, "10.01"});
|
||||
tests.add(new Object[]{10.015, "10.02"});
|
||||
tests.add(new Object[]{0.0, "0.00"});
|
||||
tests.add(new Object[]{0.5, "0.500"});
|
||||
tests.add(new Object[]{0.55, "0.550"});
|
||||
tests.add(new Object[]{0.555, "0.555"});
|
||||
tests.add(new Object[]{0.5555, "0.556"});
|
||||
tests.add(new Object[]{0.1, "0.100"});
|
||||
tests.add(new Object[]{0.050, "0.050"});
|
||||
tests.add(new Object[]{0.010, "0.010"});
|
||||
tests.add(new Object[]{0.012, "0.012"});
|
||||
tests.add(new Object[]{0.0012, "1.200e-03"});
|
||||
tests.add(new Object[]{1.2e-4, "1.200e-04"});
|
||||
tests.add(new Object[]{1.21e-4, "1.210e-04"});
|
||||
tests.add(new Object[]{1.212e-5, "1.212e-05"});
|
||||
tests.add(new Object[]{1.2123e-6, "1.212e-06"});
|
||||
tests.add(new Object[]{Double.POSITIVE_INFINITY, "Infinity"});
|
||||
tests.add(new Object[]{Double.NEGATIVE_INFINITY, "-Infinity"});
|
||||
tests.add(new Object[]{Double.NaN, "NaN"});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VCFWriterDoubleFormatTestData")
|
||||
public void testVCFWriterDoubleFormatTestData(final double d, final String expected) {
|
||||
Assert.assertEquals(VCFWriter.formatVCFDouble(d), expected, "Failed to pretty print double in VCFWriter");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,146 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.variantcontext.writer;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class VariantContextWritersUnitTest extends VariantBaseTest {
|
||||
private SAMSequenceDictionary dictionary;
|
||||
|
||||
@BeforeSuite
|
||||
public void before() throws IOException {
|
||||
final File source = new File(b37KGReference);
|
||||
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(source);
|
||||
dictionary = seq.getSequenceDictionary();
|
||||
VariantContextTestProvider.initializeTests();
|
||||
}
|
||||
|
||||
@DataProvider(name = "VariantContextTest_SingleContexts")
|
||||
public Object[][] SiteVCsTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( VariantContextTestProvider.VariantContextTestData testData : VariantContextTestProvider.generateSiteTests() )
|
||||
tests.add(new Object[]{testData});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test BCF2 reader / writer
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Test(dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testBCF2WriterReader(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData);
|
||||
}
|
||||
|
||||
private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
|
||||
@Override
|
||||
public String getExtension() {
|
||||
return ".bcf";
|
||||
}
|
||||
|
||||
@Override
|
||||
public FeatureCodec<VariantContext> makeCodec() {
|
||||
return new BCF2Codec();
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContextWriter makeWriter(final File file, final EnumSet<Options> baseOptions) {
|
||||
return VariantContextWriterFactory.create(file, dictionary, baseOptions);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test VCF reader / writer
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testVCF4WriterReader(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts")
|
||||
public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
|
||||
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData);
|
||||
}
|
||||
|
||||
private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
|
||||
@Override
|
||||
public String getExtension() {
|
||||
return ".vcf";
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<VariantContext> postprocess(final VCFHeader header, final List<VariantContext> vcsAfterIO) {
|
||||
final List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(vcsAfterIO.size());
|
||||
|
||||
for ( final VariantContext withStrings : vcsAfterIO )
|
||||
fullyDecoded.add(withStrings.fullyDecode(header, false));
|
||||
|
||||
return fullyDecoded;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FeatureCodec<VariantContext> makeCodec() {
|
||||
return new VCFCodec();
|
||||
}
|
||||
|
||||
@Override
|
||||
public VariantContextWriter makeWriter(final File file, final EnumSet<Options> baseOptions) {
|
||||
return VariantContextWriterFactory.create(file, dictionary, baseOptions);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broad.tribble.CloseableTribbleIterator;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.writer.Options;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* tests out the various functions in the index factory class
|
||||
*/
|
||||
public class IndexFactoryUnitTest extends VariantBaseTest {
|
||||
|
||||
File inputFile = new File(variantTestDataRoot + "HiSeq.10000.vcf");
|
||||
File outputFile = new File(variantTestDataRoot + "onTheFlyOutputTest.vcf");
|
||||
File outputFileIndex = Tribble.indexFile(outputFile);
|
||||
|
||||
private SAMSequenceDictionary dict;
|
||||
|
||||
@BeforeTest
|
||||
public void setup() {
|
||||
try {
|
||||
dict = new IndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary();
|
||||
}
|
||||
catch(FileNotFoundException ex) {
|
||||
throw new RuntimeException(b37KGReference,ex);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// test out scoring the indexes
|
||||
//
|
||||
@Test
|
||||
public void testOnTheFlyIndexing1() throws IOException {
|
||||
Index indexFromInputFile = IndexFactory.createDynamicIndex(inputFile, new VCFCodec());
|
||||
if ( outputFileIndex.exists() ) {
|
||||
System.err.println("Deleting " + outputFileIndex);
|
||||
outputFileIndex.delete();
|
||||
}
|
||||
|
||||
for ( int maxRecords : Arrays.asList(0, 1, 10, 100, 1000, -1)) {
|
||||
AbstractFeatureReader<VariantContext> source = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), new VCFCodec(), indexFromInputFile);
|
||||
|
||||
int counter = 0;
|
||||
final EnumSet<Options> options = EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
|
||||
VariantContextWriter writer = VariantContextWriterFactory.create(outputFile, dict, options);
|
||||
writer.writeHeader((VCFHeader)source.getHeader());
|
||||
CloseableTribbleIterator<VariantContext> it = source.iterator();
|
||||
while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) {
|
||||
VariantContext vc = it.next();
|
||||
writer.add(vc);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
// test that the input index is the same as the one created from the identical input file
|
||||
// test that the dynamic index is the same as the output index, which is equal to the input index
|
||||
//WalkerTest.assertOnDiskIndexEqualToNewlyCreatedIndex(outputFileIndex, "unittest", outputFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,171 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.Assert;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.*;
|
||||
import java.math.BigInteger;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaron
|
||||
* Date: Jun 30, 2010
|
||||
* Time: 3:32:08 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class VCFHeaderUnitTest extends VariantBaseTest {
|
||||
|
||||
private VCFHeader createHeader(String headerStr) {
|
||||
VCFCodec codec = new VCFCodec();
|
||||
VCFHeader header = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(headerStr))));
|
||||
Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount);
|
||||
return header;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCF4ToVCF4() {
|
||||
VCFHeader header = createHeader(VCF4headerStrings);
|
||||
checkMD5ofHeaderFile(header, "f05a57053a0c6a5bac15dba566f7f7ff");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCF4ToVCF4_alternate() {
|
||||
VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne);
|
||||
checkMD5ofHeaderFile(header, "b1d71cc94261053131f8d239d65a8c9f");
|
||||
}
|
||||
|
||||
/**
|
||||
* a little utility function for all tests to md5sum a file
|
||||
* Shameless taken from:
|
||||
*
|
||||
* http://www.javalobby.org/java/forums/t84420.html
|
||||
*
|
||||
* @param file the file
|
||||
* @return a string
|
||||
*/
|
||||
private static String md5SumFile(File file) {
|
||||
MessageDigest digest;
|
||||
try {
|
||||
digest = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException("Unable to find MD5 digest");
|
||||
}
|
||||
InputStream is;
|
||||
try {
|
||||
is = new FileInputStream(file);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new RuntimeException("Unable to open file " + file);
|
||||
}
|
||||
byte[] buffer = new byte[8192];
|
||||
int read;
|
||||
try {
|
||||
while ((read = is.read(buffer)) > 0) {
|
||||
digest.update(buffer, 0, read);
|
||||
}
|
||||
byte[] md5sum = digest.digest();
|
||||
BigInteger bigInt = new BigInteger(1, md5sum);
|
||||
return bigInt.toString(16);
|
||||
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to process file for MD5", e);
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
is.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) {
|
||||
File myTempFile = null;
|
||||
PrintWriter pw = null;
|
||||
try {
|
||||
myTempFile = File.createTempFile("VCFHeader","vcf");
|
||||
myTempFile.deleteOnExit();
|
||||
pw = new PrintWriter(myTempFile);
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Unable to make a temp file!");
|
||||
}
|
||||
for (VCFHeaderLine line : header.getMetaDataInSortedOrder())
|
||||
pw.println(line);
|
||||
pw.close();
|
||||
Assert.assertEquals(md5SumFile(myTempFile), md5sum);
|
||||
}
|
||||
|
||||
public static int VCF4headerStringCount = 16;
|
||||
|
||||
public static String VCF4headerStrings =
|
||||
"##fileformat=VCFv4.0\n"+
|
||||
"##filedate=2010-06-21\n"+
|
||||
"##reference=NCBI36\n"+
|
||||
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
|
||||
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
|
||||
"##INFO=<ID=AF, Number=A, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
|
||||
"##INFO=<ID=CA, Number=1, Type=String, Description=\"Pilot 1 callability mask\">\n"+
|
||||
"##INFO=<ID=HP, Number=1, Type=Integer, Description=\"Reference homopolymer tract length\">\n"+
|
||||
"##INFO=<ID=NS, Number=1, Type=Integer, Description=\"Number of samples with data\">\n"+
|
||||
"##INFO=<ID=DB, Number=0, Type=Flag, Description=\"dbSNP membership build 129 - type match and indel sequence length match within 25 bp\">\n"+
|
||||
"##INFO=<ID=NR, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on reverse strand\">\n"+
|
||||
"##INFO=<ID=NF, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on forward strand\">\n"+
|
||||
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
|
||||
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
|
||||
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
|
||||
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
|
||||
|
||||
|
||||
public static String VCF4headerStrings_with_negativeOne =
|
||||
"##fileformat=VCFv4.0\n"+
|
||||
"##filedate=2010-06-21\n"+
|
||||
"##reference=NCBI36\n"+
|
||||
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
|
||||
"##INFO=<ID=YY, Number=., Type=Integer, Description=\"Some weird value that has lots of parameters\">\n"+
|
||||
"##INFO=<ID=AF, Number=A, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
|
||||
"##INFO=<ID=CA, Number=1, Type=String, Description=\"Pilot 1 callability mask\">\n"+
|
||||
"##INFO=<ID=HP, Number=1, Type=Integer, Description=\"Reference homopolymer tract length\">\n"+
|
||||
"##INFO=<ID=NS, Number=1, Type=Integer, Description=\"Number of samples with data\">\n"+
|
||||
"##INFO=<ID=DB, Number=0, Type=Flag, Description=\"dbSNP membership build 129 - type match and indel sequence length match within 25 bp\">\n"+
|
||||
"##INFO=<ID=NR, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on reverse strand\">\n"+
|
||||
"##INFO=<ID=NF, Number=1, Type=Integer, Description=\"Number of reads covering non-ref variant on forward strand\">\n"+
|
||||
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
|
||||
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
|
||||
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
|
||||
"##FORMAT=<ID=TT, Number=., Type=Integer, Description=\"Lots of TTs\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
|
||||
|
||||
}
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.variant.vcf;
|
||||
|
||||
import org.broadinstitute.variant.VariantBaseTest;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaron
|
||||
* Date: Jun 30, 2010
|
||||
* Time: 3:32:08 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class VCFStandardHeaderLinesUnitTest extends VariantBaseTest {
|
||||
@DataProvider(name = "getStandardLines")
|
||||
public Object[][] makeGetStandardLines() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// info
|
||||
tests.add(new Object[]{"AC", "info", true});
|
||||
tests.add(new Object[]{"AN", "info", true});
|
||||
tests.add(new Object[]{"AF", "info", true});
|
||||
tests.add(new Object[]{"DP", "info", true});
|
||||
tests.add(new Object[]{"DB", "info", true});
|
||||
tests.add(new Object[]{"END", "info", true});
|
||||
|
||||
// format
|
||||
tests.add(new Object[]{"GT", "format", true});
|
||||
tests.add(new Object[]{"GQ", "format", true});
|
||||
tests.add(new Object[]{"DP", "format", true});
|
||||
tests.add(new Object[]{"AD", "format", true});
|
||||
tests.add(new Object[]{"PL", "format", true});
|
||||
|
||||
tests.add(new Object[]{"NOT_STANDARD", "info", false});
|
||||
tests.add(new Object[]{"NOT_STANDARD", "format", false});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "getStandardLines")
|
||||
public void getStandardLines(final String key, final String type, final boolean expectedToBeStandard) {
|
||||
VCFCompoundHeaderLine line = null;
|
||||
if ( type.equals("info") )
|
||||
line = VCFStandardHeaderLines.getInfoLine(key, false);
|
||||
else if ( type.equals("format") )
|
||||
line = VCFStandardHeaderLines.getFormatLine(key, false);
|
||||
else
|
||||
throw new IllegalArgumentException("Unexpected type in getStandardLines " + type);
|
||||
|
||||
if ( expectedToBeStandard ) {
|
||||
Assert.assertNotNull(line);
|
||||
Assert.assertEquals(line.getID(), key);
|
||||
} else
|
||||
Assert.assertNull(line);
|
||||
}
|
||||
|
||||
private class RepairHeaderTest extends TestDataProvider {
|
||||
final VCFCompoundHeaderLine original, expectedResult;
|
||||
|
||||
private RepairHeaderTest(final VCFCompoundHeaderLine original) {
|
||||
this(original, original);
|
||||
}
|
||||
|
||||
private RepairHeaderTest(final VCFCompoundHeaderLine original, final VCFCompoundHeaderLine expectedResult) {
|
||||
super(RepairHeaderTest.class);
|
||||
this.original = original;
|
||||
this.expectedResult = expectedResult;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "RepairHeaderTest")
|
||||
public Object[][] makeRepairHeaderTest() {
|
||||
final VCFInfoHeaderLine standardAC = VCFStandardHeaderLines.getInfoLine("AC");
|
||||
final VCFInfoHeaderLine goodAC = new VCFInfoHeaderLine("AC", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "x");
|
||||
|
||||
final VCFFormatHeaderLine standardGT = VCFStandardHeaderLines.getFormatLine("GT");
|
||||
final VCFFormatHeaderLine goodGT = new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x");
|
||||
|
||||
new RepairHeaderTest( standardGT, standardGT);
|
||||
new RepairHeaderTest( goodGT, goodGT );
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", 2, VCFHeaderLineType.String, "x"), standardGT);
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.Integer, "x"), standardGT);
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.Float, "x"), standardGT);
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "x"), standardGT);
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.G, VCFHeaderLineType.String, "x"), standardGT);
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("GT", VCFHeaderLineCount.A, VCFHeaderLineType.String, "x"), standardGT);
|
||||
|
||||
new RepairHeaderTest( standardAC, standardAC);
|
||||
new RepairHeaderTest( goodAC, goodAC );
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.Integer, "x"), standardAC);
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "x"), standardAC);
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"), standardAC);
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.Float, "x"), standardAC);
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.String, "x"), standardAC);
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("AC", 0, VCFHeaderLineType.Flag, "x"), standardAC);
|
||||
|
||||
new RepairHeaderTest( new VCFInfoHeaderLine("NON_STANDARD_INFO", 1, VCFHeaderLineType.String, "x"));
|
||||
new RepairHeaderTest( new VCFFormatHeaderLine("NON_STANDARD_FORMAT", 1, VCFHeaderLineType.String, "x"));
|
||||
|
||||
return RepairHeaderTest.getTests(RepairHeaderTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "RepairHeaderTest")
|
||||
public void testRepairHeaderTest(RepairHeaderTest cfg) {
|
||||
final VCFHeader toRepair = new VCFHeader(Collections.singleton((VCFHeaderLine)cfg.original));
|
||||
final VCFHeader repaired = VCFStandardHeaderLines.repairStandardHeaderLines(toRepair);
|
||||
|
||||
VCFCompoundHeaderLine repairedLine = (VCFCompoundHeaderLine)repaired.getFormatHeaderLine(cfg.original.getID());
|
||||
if ( repairedLine == null ) repairedLine = (VCFCompoundHeaderLine)repaired.getInfoHeaderLine(cfg.original.getID());
|
||||
|
||||
Assert.assertNotNull(repairedLine, "Repaired header didn't contain the expected line");
|
||||
Assert.assertEquals(repairedLine.getID(), cfg.expectedResult.getID());
|
||||
Assert.assertEquals(repairedLine.getType(), cfg.expectedResult.getType());
|
||||
Assert.assertEquals(repairedLine.getCountType(), cfg.expectedResult.getCountType());
|
||||
if ( repairedLine.getCountType() == VCFHeaderLineCount.INTEGER )
|
||||
Assert.assertEquals(repairedLine.getCount(), cfg.expectedResult.getCount());
|
||||
}
|
||||
}
|
||||
Binary file not shown.
|
|
@ -0,0 +1,3 @@
|
|||
<ivy-module version="1.0">
|
||||
<info organisation="org.broadinstitute" module="variant" revision="1.84.1338" status="integration" />
|
||||
</ivy-module>
|
||||
Loading…
Reference in New Issue