gatk-3.8/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java

477 lines
17 KiB
Java
Raw Normal View History

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.variant.variantcontext;
import net.sf.samtools.util.StringUtil;
import java.util.Arrays;
import java.util.Collection;
/**
* Immutable representation of an allele
*
* Types of alleles:
*
* Ref: a t C g a // C is the reference base
*
* : a t G g a // C base is a G in some individuals
*
* : a t - g a // C base is deleted w.r.t. the reference
*
* : a t CAg a // A base is inserted w.r.t. the reference sequence
*
* In these cases, where are the alleles?
*
* SNP polymorphism of C/G -> { C , G } -> C is the reference allele
* 1 base deletion of C -> { C , - } -> C is the reference allele
* 1 base insertion of A -> { - ; A } -> Null is the reference allele
*
* Suppose I see a the following in the population:
*
* Ref: a t C g a // C is the reference base
* : a t G g a // C base is a G in some individuals
* : a t - g a // C base is deleted w.r.t. the reference
*
* How do I represent this? There are three segregating alleles:
*
* { C , G , - }
*
* Now suppose I have this more complex example:
*
* Ref: a t C g a // C is the reference base
* : a t - g a
* : a t - - a
* : a t CAg a
*
* There are actually four segregating alleles:
*
* { C g , - g, - -, and CAg } over bases 2-4
*
* However, the molecular equivalence explicitly listed above is usually discarded, so the actual
* segregating alleles are:
*
* { C g, g, -, C a g }
*
* Critically, it should be possible to apply an allele to a reference sequence to create the
* correct haplotype sequence:
*
* Allele + reference => haplotype
*
* For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the
* Allele object itself. So there's an idea of an A/C polymorphism independent of it's surrounding context.
*
* Given list of alleles it's possible to determine the "type" of the variation
*
* A / C @ loc => SNP with
* - / A => INDEL
*
* If you know where allele is the reference, you can determine whether the variant is an insertion or deletion.
*
* Alelle also supports is concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be
* determined. This is usually represented by a '.' allele.
*
* Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an
* Allele.
* @author ebanks, depristo
*/
public class Allele implements Comparable<Allele> {
private static final byte[] EMPTY_ALLELE_BASES = new byte[0];
private boolean isRef = false;
private boolean isNoCall = false;
private boolean isSymbolic = false;
private byte[] bases = null;
public final static String NO_CALL_STRING = ".";
/** A generic static NO_CALL allele for use */
// no public way to create an allele
protected Allele(byte[] bases, boolean isRef) {
// null alleles are no longer allowed
if ( wouldBeNullAllele(bases) ) {
throw new IllegalArgumentException("Null alleles are not supported");
}
// no-calls are represented as no bases
if ( wouldBeNoCallAllele(bases) ) {
this.bases = EMPTY_ALLELE_BASES;
isNoCall = true;
if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
return;
}
if ( wouldBeSymbolicAllele(bases) ) {
isSymbolic = true;
if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele");
}
else {
StringUtil.toUpperCase(bases);
}
this.isRef = isRef;
this.bases = bases;
if ( ! acceptableAlleleBases(bases) )
throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
}
protected Allele(String bases, boolean isRef) {
this(bases.getBytes(), isRef);
}
private final static Allele REF_A = new Allele("A", true);
private final static Allele ALT_A = new Allele("A", false);
private final static Allele REF_C = new Allele("C", true);
private final static Allele ALT_C = new Allele("C", false);
private final static Allele REF_G = new Allele("G", true);
private final static Allele ALT_G = new Allele("G", false);
private final static Allele REF_T = new Allele("T", true);
private final static Allele ALT_T = new Allele("T", false);
private final static Allele REF_N = new Allele("N", true);
private final static Allele ALT_N = new Allele("N", false);
public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false);
// ---------------------------------------------------------------------------------------------------------
//
// creation routines
//
// ---------------------------------------------------------------------------------------------------------
/**
* Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases
* == '-', a Null allele is created. If bases == '.', a no call Allele is created.
*
* @param bases the DNA sequence of this variation, '-', of '.'
* @param isRef should we make this a reference allele?
* @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated
*/
public static Allele create(byte[] bases, boolean isRef) {
if ( bases == null )
throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");
if ( bases.length == 1 ) {
// optimization to return a static constant Allele for each single base object
switch (bases[0]) {
case '.':
if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
return NO_CALL;
case 'A': case 'a' : return isRef ? REF_A : ALT_A;
case 'C': case 'c' : return isRef ? REF_C : ALT_C;
case 'G': case 'g' : return isRef ? REF_G : ALT_G;
case 'T': case 't' : return isRef ? REF_T : ALT_T;
case 'N': case 'n' : return isRef ? REF_N : ALT_N;
default: throw new IllegalArgumentException("Illegal base [" + (char)bases[0] + "] seen in the allele");
}
} else {
return new Allele(bases, isRef);
}
}
public static Allele create(byte base, boolean isRef) {
// public Allele(byte base, boolean isRef) {
return create( new byte[]{ base }, isRef);
}
public static Allele create(byte base) {
return create( base, false );
}
public static Allele extend(Allele left, byte[] right) {
if (left.isSymbolic())
throw new IllegalArgumentException("Cannot extend a symbolic allele");
byte[] bases = new byte[left.length() + right.length];
System.arraycopy(left.getBases(), 0, bases, 0, left.length());
System.arraycopy(right, 0, bases, left.length(), right.length);
return create(bases, left.isReference());
}
/**
* @param bases bases representing an allele
* @return true if the bases represent the null allele
*/
public static boolean wouldBeNullAllele(byte[] bases) {
return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
}
/**
* @param bases bases representing an allele
* @return true if the bases represent the NO_CALL allele
*/
public static boolean wouldBeNoCallAllele(byte[] bases) {
return bases.length == 1 && bases[0] == '.';
}
/**
* @param bases bases representing an allele
* @return true if the bases represent a symbolic allele
*/
public static boolean wouldBeSymbolicAllele(byte[] bases) {
if ( bases.length <= 2 )
return false;
else {
final String strBases = new String(bases);
return (bases[0] == '<' && bases[bases.length-1] == '>') ||
(strBases.contains("[") || strBases.contains("]"));
}
}
/**
* @param bases bases representing an allele
* @return true if the bases represent the well formatted allele
*/
public static boolean acceptableAlleleBases(String bases) {
return acceptableAlleleBases(bases.getBytes(), true);
}
public static boolean acceptableAlleleBases(String bases, boolean allowNsAsAcceptable) {
return acceptableAlleleBases(bases.getBytes(), allowNsAsAcceptable);
}
/**
* @param bases bases representing an allele
* @return true if the bases represent the well formatted allele
*/
public static boolean acceptableAlleleBases(byte[] bases) {
return acceptableAlleleBases(bases, true); // default: N bases are acceptable
}
public static boolean acceptableAlleleBases(byte[] bases, boolean allowNsAsAcceptable) {
if ( wouldBeNullAllele(bases) )
return false;
if ( wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) )
return true;
for (byte base : bases ) {
switch (base) {
case 'A': case 'C': case 'G': case 'T': case 'a': case 'c': case 'g': case 't':
break;
case 'N' : case 'n' :
if (allowNsAsAcceptable)
break;
else
return false;
default:
return false;
}
}
return true;
}
/**
* @see Allele(byte[], boolean)
*
* @param bases bases representing an allele
* @param isRef is this the reference allele?
*/
public static Allele create(String bases, boolean isRef) {
//public Allele(String bases, boolean isRef) {
return create(bases.getBytes(), isRef);
}
/**
* Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
*
* @param bases bases representing an allele
*/
public static Allele create(String bases) {
return create(bases, false);
}
/**
* Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
*
* @param bases bases representing an allele
*/
public static Allele create(byte[] bases) {
return create(bases, false);
//this(bases, false);
}
// ---------------------------------------------------------------------------------------------------------
//
// accessor routines
//
// ---------------------------------------------------------------------------------------------------------
// Returns true if this is the NO_CALL allele
public boolean isNoCall() { return isNoCall; }
// Returns true if this is not the NO_CALL allele
public boolean isCalled() { return ! isNoCall(); }
// Returns true if this Allele is the reference allele
public boolean isReference() { return isRef; }
// Returns true if this Allele is not the reference allele
public boolean isNonReference() { return ! isReference(); }
// Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
public boolean isSymbolic() { return isSymbolic; }
// Returns a nice string representation of this object
public String toString() {
return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : "");
}
/**
* Return the DNA bases segregating in this allele. Note this isn't reference polarized,
* so the Null allele is represented by a vector of length 0
*
* @return the segregating bases
*/
public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; }
/**
* Return the DNA bases segregating in this allele in String format.
* This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it.
*
* @return the segregating bases
*/
public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); }
/**
* Return the printed representation of this allele.
* Same as getBaseString(), except for symbolic alleles.
* For symbolic alleles, the base string is empty while the display string contains <TAG>.
*
* @return the allele string representation
*/
public String getDisplayString() { return new String(bases); }
Finalizing BCF2 mark III commit -- Moved GENOTYPE_KEY vcf header line to VCFConstants. This general migration and cleanup is on Eric's plate now -- Updated HC to initialize the annotation engine in an order that allows it to write a proper VCF header. Still doesn't work... -- Updating integration test files. Moved many more files into public/testdata. Updated their headers to all work correctly with new strict VCF header checking. -- Bugfix for TandemRepeatAnnotation that must be unbounded not A count type as it provides info for the REF as well as each alt -- No longer add FALSE values to flag values in VCs in VariantAnnotatorEngine. DB = 0 is never seen in the output VCFs now -- Fixed bug in VCFDiffableReader that didn't differeniate between "." and "PASS" VC filter status -- Unconditionally add lowQual Filter to UG output VCF files as this is in some cases (EMIT_ALL_SITES) used when the previous check said it wouldn't be -- VariantsToVCF now properly writes out the GT FORMAT field -- BCF2 codec explodes when reading symbolic alleles as I literally cannot figure out how to use the allele clipping code. Eric said he and Ami will clean up this whole piece of instructure -- Fixed bug in BCF2Codec that wasn't setting the phase field correctly. UnitTested now -- PASS string now added at the end of the BCF2 dictionary after discussion with Heng -- Fixed bug where I was writing out all field values as BigEndian. Now everything is LittleEndian. -- VCFHeader detects the case where a count field has size < 0 (some of our files have count = -1) and throws a UserException -- Cleaned up unused code -- Fixed bug in BCF2 string encoder that wasn't handling the case of an empty list of strings for encoding -- Fixed bug where all samples are no called in a VC, in which case we (like the VCFwriter) write out no called diploid genotypes for all samples -- We always write the number of genotype samples into the BCF2 nSamples header. How we can have a variable number of samples per record isn't clear to me, as we don't have a map from missing samples to header names... -- Removed old filtersWereAppliedToContext code in VCF as properly handle unfiltered, filtered, and PASS records internally -- Fastpath function getDisplayBases() in allele that just gives you the raw bytes[] you'd see for an Allele -- Genotype fields no longer differentiate between unfiltered, filtered, and PASS values. Genotype objects are all PASS implicitly, or explicitly filtered. We only write out the FT values if at least one sample is filtered. Removed interface functions and cleaned up code -- Refactored padAllele code from createVariantContextWithPaddedAlleles into the function padAllele so that it actually works. In general, **** NEVER COPY CODE **** if you need to share funcitonality make a function, that's why there were invented! -- Increased the default number of records to read for DiffObjects to 1M
2012-06-19 21:46:26 +08:00
/**
* Same as #getDisplayString() but returns the result as byte[].
*
* Slightly faster then getDisplayString()
*
* @return the allele string representation
*/
public byte[] getDisplayBases() { return bases; }
/**
* @param other the other allele
*
* @return true if these alleles are equal
*/
public boolean equals(Object other) {
return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) );
}
/**
* @return hash code
*/
public int hashCode() {
int hash = 1;
for (int i = 0; i < bases.length; i++)
hash += (i+1) * bases[i];
return hash;
}
/**
* Returns true if this and other are equal. If ignoreRefState is true, then doesn't require both alleles has the
* same ref tag
*
* @param other allele to compare to
* @param ignoreRefState if true, ignore ref state in comparison
* @return true if this and other are equal
*/
public boolean equals(Allele other, boolean ignoreRefState) {
return this == other || (isRef == other.isRef || ignoreRefState) && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases));
}
/**
* @param test bases to test against
*
* @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
*/
public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); }
/**
* @param test bases to test against
*
* @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
*/
public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }
/**
* @param test allele to test against
*
* @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
*/
public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }
/**
* @return the length of this allele. Null and NO_CALL alleles have 0 length.
*/
public int length() {
return isSymbolic ? 0 : bases.length;
}
// ---------------------------------------------------------------------------------------------------------
//
// useful static functions
//
// ---------------------------------------------------------------------------------------------------------
public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) {
for ( Allele a : allAlleles ) {
if ( a.basesMatch(alleleBases) ) {
return a;
}
}
if ( wouldBeNoCallAllele(alleleBases) )
return NO_CALL;
else
return null; // couldn't find anything
}
public int compareTo(Allele other) {
if ( isReference() && other.isNonReference() )
return -1;
else if ( isNonReference() && other.isReference() )
return 1;
else
return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue
}
public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) {
if ( a2.length() >= a1.length() )
return firstIsPrefixOfSecond(a1, a2);
else
return firstIsPrefixOfSecond(a2, a1);
}
private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) {
String a1String = a1.getBaseString();
return a2.getBaseString().substring(0, a1String.length()).equals(a1String);
}
}