gatk-3.8/public/java/src/org/broadinstitute/variant/bcf2/BCF2Utils.java

334 lines
12 KiB
Java
Raw Normal View History

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.variant.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broad.tribble.TribbleException;
import org.broadinstitute.variant.vcf.*;
import java.io.*;
import java.util.*;
/**
* Common utilities for working with BCF2 files
*
* Includes convenience methods for encoding, decoding BCF2 type descriptors (size + type)
*
* @author depristo
* @since 5/12
*/
public final class BCF2Utils {
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
public final static BCF2Type[] ID_TO_ENUM;
static {
int maxID = -1;
for ( BCF2Type v : BCF2Type.values() ) maxID = Math.max(v.getID(), maxID);
ID_TO_ENUM = new BCF2Type[maxID+1];
for ( BCF2Type v : BCF2Type.values() ) ID_TO_ENUM[v.getID()] = v;
}
private BCF2Utils() {}
/**
* Create a strings dictionary from the VCF header
*
* The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT)
* fields.
*
* Note that its critical that the list be dedupped and sorted in a consistent manner each time,
* as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly
* the same way as in the header each time it's very bad
*
* @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty
*/
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public static ArrayList<String> makeDictionary(final VCFHeader header) {
Finalizing BCF2 mark III commit -- Moved GENOTYPE_KEY vcf header line to VCFConstants. This general migration and cleanup is on Eric's plate now -- Updated HC to initialize the annotation engine in an order that allows it to write a proper VCF header. Still doesn't work... -- Updating integration test files. Moved many more files into public/testdata. Updated their headers to all work correctly with new strict VCF header checking. -- Bugfix for TandemRepeatAnnotation that must be unbounded not A count type as it provides info for the REF as well as each alt -- No longer add FALSE values to flag values in VCs in VariantAnnotatorEngine. DB = 0 is never seen in the output VCFs now -- Fixed bug in VCFDiffableReader that didn't differeniate between "." and "PASS" VC filter status -- Unconditionally add lowQual Filter to UG output VCF files as this is in some cases (EMIT_ALL_SITES) used when the previous check said it wouldn't be -- VariantsToVCF now properly writes out the GT FORMAT field -- BCF2 codec explodes when reading symbolic alleles as I literally cannot figure out how to use the allele clipping code. Eric said he and Ami will clean up this whole piece of instructure -- Fixed bug in BCF2Codec that wasn't setting the phase field correctly. UnitTested now -- PASS string now added at the end of the BCF2 dictionary after discussion with Heng -- Fixed bug where I was writing out all field values as BigEndian. Now everything is LittleEndian. -- VCFHeader detects the case where a count field has size < 0 (some of our files have count = -1) and throws a UserException -- Cleaned up unused code -- Fixed bug in BCF2 string encoder that wasn't handling the case of an empty list of strings for encoding -- Fixed bug where all samples are no called in a VC, in which case we (like the VCFwriter) write out no called diploid genotypes for all samples -- We always write the number of genotype samples into the BCF2 nSamples header. How we can have a variable number of samples per record isn't clear to me, as we don't have a map from missing samples to header names... -- Removed old filtersWereAppliedToContext code in VCF as properly handle unfiltered, filtered, and PASS records internally -- Fastpath function getDisplayBases() in allele that just gives you the raw bytes[] you'd see for an Allele -- Genotype fields no longer differentiate between unfiltered, filtered, and PASS values. Genotype objects are all PASS implicitly, or explicitly filtered. We only write out the FT values if at least one sample is filtered. Removed interface functions and cleaned up code -- Refactored padAllele code from createVariantContextWithPaddedAlleles into the function padAllele so that it actually works. In general, **** NEVER COPY CODE **** if you need to share funcitonality make a function, that's why there were invented! -- Increased the default number of records to read for DiffObjects to 1M
2012-06-19 21:46:26 +08:00
final Set<String> seen = new HashSet<String>();
final ArrayList<String> dict = new ArrayList<String>();
// special case the special PASS field which doesn't show up in the FILTER field definitions
seen.add(VCFConstants.PASSES_FILTERS_v4);
dict.add(VCFConstants.PASSES_FILTERS_v4);
// set up the strings dictionary
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) {
Finalizing BCF2 mark III commit -- Moved GENOTYPE_KEY vcf header line to VCFConstants. This general migration and cleanup is on Eric's plate now -- Updated HC to initialize the annotation engine in an order that allows it to write a proper VCF header. Still doesn't work... -- Updating integration test files. Moved many more files into public/testdata. Updated their headers to all work correctly with new strict VCF header checking. -- Bugfix for TandemRepeatAnnotation that must be unbounded not A count type as it provides info for the REF as well as each alt -- No longer add FALSE values to flag values in VCs in VariantAnnotatorEngine. DB = 0 is never seen in the output VCFs now -- Fixed bug in VCFDiffableReader that didn't differeniate between "." and "PASS" VC filter status -- Unconditionally add lowQual Filter to UG output VCF files as this is in some cases (EMIT_ALL_SITES) used when the previous check said it wouldn't be -- VariantsToVCF now properly writes out the GT FORMAT field -- BCF2 codec explodes when reading symbolic alleles as I literally cannot figure out how to use the allele clipping code. Eric said he and Ami will clean up this whole piece of instructure -- Fixed bug in BCF2Codec that wasn't setting the phase field correctly. UnitTested now -- PASS string now added at the end of the BCF2 dictionary after discussion with Heng -- Fixed bug where I was writing out all field values as BigEndian. Now everything is LittleEndian. -- VCFHeader detects the case where a count field has size < 0 (some of our files have count = -1) and throws a UserException -- Cleaned up unused code -- Fixed bug in BCF2 string encoder that wasn't handling the case of an empty list of strings for encoding -- Fixed bug where all samples are no called in a VC, in which case we (like the VCFwriter) write out no called diploid genotypes for all samples -- We always write the number of genotype samples into the BCF2 nSamples header. How we can have a variable number of samples per record isn't clear to me, as we don't have a map from missing samples to header names... -- Removed old filtersWereAppliedToContext code in VCF as properly handle unfiltered, filtered, and PASS records internally -- Fastpath function getDisplayBases() in allele that just gives you the raw bytes[] you'd see for an Allele -- Genotype fields no longer differentiate between unfiltered, filtered, and PASS values. Genotype objects are all PASS implicitly, or explicitly filtered. We only write out the FT values if at least one sample is filtered. Removed interface functions and cleaned up code -- Refactored padAllele code from createVariantContextWithPaddedAlleles into the function padAllele so that it actually works. In general, **** NEVER COPY CODE **** if you need to share funcitonality make a function, that's why there were invented! -- Increased the default number of records to read for DiffObjects to 1M
2012-06-19 21:46:26 +08:00
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
if ( ! seen.contains(idLine.getID())) {
dict.add(idLine.getID());
seen.add(idLine.getID());
}
}
}
Finalizing BCF2 mark III commit -- Moved GENOTYPE_KEY vcf header line to VCFConstants. This general migration and cleanup is on Eric's plate now -- Updated HC to initialize the annotation engine in an order that allows it to write a proper VCF header. Still doesn't work... -- Updating integration test files. Moved many more files into public/testdata. Updated their headers to all work correctly with new strict VCF header checking. -- Bugfix for TandemRepeatAnnotation that must be unbounded not A count type as it provides info for the REF as well as each alt -- No longer add FALSE values to flag values in VCs in VariantAnnotatorEngine. DB = 0 is never seen in the output VCFs now -- Fixed bug in VCFDiffableReader that didn't differeniate between "." and "PASS" VC filter status -- Unconditionally add lowQual Filter to UG output VCF files as this is in some cases (EMIT_ALL_SITES) used when the previous check said it wouldn't be -- VariantsToVCF now properly writes out the GT FORMAT field -- BCF2 codec explodes when reading symbolic alleles as I literally cannot figure out how to use the allele clipping code. Eric said he and Ami will clean up this whole piece of instructure -- Fixed bug in BCF2Codec that wasn't setting the phase field correctly. UnitTested now -- PASS string now added at the end of the BCF2 dictionary after discussion with Heng -- Fixed bug where I was writing out all field values as BigEndian. Now everything is LittleEndian. -- VCFHeader detects the case where a count field has size < 0 (some of our files have count = -1) and throws a UserException -- Cleaned up unused code -- Fixed bug in BCF2 string encoder that wasn't handling the case of an empty list of strings for encoding -- Fixed bug where all samples are no called in a VC, in which case we (like the VCFwriter) write out no called diploid genotypes for all samples -- We always write the number of genotype samples into the BCF2 nSamples header. How we can have a variable number of samples per record isn't clear to me, as we don't have a map from missing samples to header names... -- Removed old filtersWereAppliedToContext code in VCF as properly handle unfiltered, filtered, and PASS records internally -- Fastpath function getDisplayBases() in allele that just gives you the raw bytes[] you'd see for an Allele -- Genotype fields no longer differentiate between unfiltered, filtered, and PASS values. Genotype objects are all PASS implicitly, or explicitly filtered. We only write out the FT values if at least one sample is filtered. Removed interface functions and cleaned up code -- Refactored padAllele code from createVariantContextWithPaddedAlleles into the function padAllele so that it actually works. In general, **** NEVER COPY CODE **** if you need to share funcitonality make a function, that's why there were invented! -- Increased the default number of records to read for DiffObjects to 1M
2012-06-19 21:46:26 +08:00
return dict;
}
@Requires({"nElements >= 0", "nElements <= OVERFLOW_ELEMENT_MARKER", "type != null"})
public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
return (byte)((0x0F & nElements) << 4 | (type.getID() & 0x0F));
}
@Ensures("result >= 0")
public static int decodeSize(final byte typeDescriptor) {
return (0xF0 & typeDescriptor) >> 4;
}
@Ensures("result >= 0")
public static int decodeTypeID(final byte typeDescriptor) {
return typeDescriptor & 0x0F;
}
@Ensures("result != null")
public static BCF2Type decodeType(final byte typeDescriptor) {
return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
}
public static boolean sizeIsOverflow(final byte typeDescriptor) {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
public static byte readByte(final InputStream stream) throws IOException {
return (byte)(stream.read() & 0xFF);
}
/**
* Collapse multiple strings into a comma separated list
*
* ["s1", "s2", "s3"] => ",s1,s2,s3"
*
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null"})
@Ensures("result != null")
public static String collapseStringList(final List<String> strings) {
if ( strings.isEmpty() ) return "";
else if ( strings.size() == 1 ) return strings.get(0);
else {
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
if ( s != null ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
}
}
return b.toString();
}
}
/**
* Inverse operation of collapseStringList.
*
* ",s1,s2,s3" => ["s1", "s2", "s3"]
*
*
* @param collapsed
* @return
*/
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static List<String> explodeStringList(final String collapsed) {
assert isCollapsedString(collapsed);
final String[] exploded = collapsed.substring(1).split(",");
return Arrays.asList(exploded);
}
@Requires("s != null")
public static boolean isCollapsedString(final String s) {
return s.length() > 0 && s.charAt(0) == ',';
}
/**
* Returns a good name for a shadow BCF file for vcfFile.
*
* foo.vcf => foo.bcf
* foo.xxx => foo.xxx.bcf
*
* If the resulting BCF file cannot be written, return null. Happens
* when vcfFile = /dev/null for example
*
* @param vcfFile
* @return the BCF
*/
@Requires("vcfFile != null")
public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") )
return new File(path.replace(".vcf", ".bcf"));
else {
final File bcf = new File( path + ".bcf" );
if ( bcf.canRead() )
return bcf;
else {
try {
// this is the only way to robustly decide if we could actually write to BCF
final FileOutputStream o = new FileOutputStream(bcf);
o.close();
bcf.delete();
return bcf;
} catch ( FileNotFoundException e ) {
return null;
} catch ( IOException e ) {
return null;
}
}
}
}
@Ensures("result.isIntegerType()")
public static BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
if ( potentialType.withinRange(value) )
return potentialType;
}
throw new TribbleException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
@Ensures("result.isIntegerType()")
public static BCF2Type determineIntegerType(final int[] values) {
// find the min and max values in the array
int max = 0, min = 0;
for ( final int v : values ) {
if ( v > max ) max = v;
if ( v < min ) min = v;
}
final BCF2Type maxType = determineIntegerType(max);
final BCF2Type minType = determineIntegerType(min);
// INT8 < INT16 < INT32 so this returns the larger of the two
return maxType.compareTo(minType) >= 0 ? maxType : minType;
}
/**
* Returns the maximum BCF2 integer size of t1 and t2
*
* For example, if t1 == INT8 and t2 == INT16 returns INT16
*
* @param t1
* @param t2
* @return
*/
@Requires({"t1.isIntegerType()","t2.isIntegerType()"})
@Ensures("result.isIntegerType()")
public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
switch ( t1 ) {
case INT8: return t2;
case INT16: return t2 == BCF2Type.INT32 ? t2 : t1;
case INT32: return t1;
default: throw new TribbleException("BUG: unexpected BCF2Type " + t1);
}
}
@Ensures("result.isIntegerType()")
public static BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
final BCF2Type type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCF2Type.INT16; break;
case INT32: return BCF2Type.INT32; // fast path for largest possible value
default: throw new TribbleException("Unexpected integer type " + type1 );
}
}
return maxType;
}
/**
* Helper function that takes an object and returns a list representation
* of it:
*
* o == null => []
* o is a list => o
* else => [o]
*
* @param o
* @return
*/
public static List<Object> toList(final Object o) {
if ( o == null ) return Collections.emptyList();
else if ( o instanceof List ) return (List<Object>)o;
else return Collections.singletonList(o);
}
/**
* Are the elements and their order in the output and input headers consistent so that
* we can write out the raw genotypes block without decoding and recoding it?
*
* If the order of INFO, FILTER, or contrig elements in the output header is different than
* in the input header we must decode the blocks using the input header and then recode them
* based on the new output order.
*
* If they are consistent, we can simply pass through the raw genotypes block bytes, which is
* a *huge* performance win for large blocks.
*
* Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
* don't modify the ordering of the header fields and so can safely pass through the genotypes
* undecoded. Some operations -- those at add filters or info fields -- can change the ordering
* of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded
*/
public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
// first, we have to have the same samples in the same order
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
return false;
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
while ( inputLinesIt.hasNext() ) {
if ( ! outputLinesIt.hasNext() ) // missing lines in output
return false;
final VCFIDHeaderLine outputLine = outputLinesIt.next();
final VCFIDHeaderLine inputLine = inputLinesIt.next();
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
return false;
}
return true;
}
private static <T> List<T> nullAsEmpty(List<T> l) {
if ( l == null )
return Collections.emptyList();
else
return l;
}
}