2010-01-26 21:53:29 +08:00
|
|
|
package org.broadinstitute.sting.oneoffprojects.variantcontext;
|
|
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.utils.BaseUtils;
|
|
|
|
|
|
|
|
|
|
import java.util.Arrays;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author ebanks, depristo
|
|
|
|
|
* Types of alleles:
|
|
|
|
|
*
|
|
|
|
|
* Ref: a t C g a // C is the reference base
|
|
|
|
|
*
|
|
|
|
|
* : a t G g a // C base is a G in some individuals
|
|
|
|
|
*
|
|
|
|
|
* : a t - g a // C base is deleted w.r.t. the reference
|
|
|
|
|
*
|
|
|
|
|
* : a t CAg a // A base is inserted w.r.t. the reference sequence
|
|
|
|
|
*
|
|
|
|
|
* In these cases, where are the alleles?
|
|
|
|
|
*
|
|
|
|
|
* SNP polymorphism of C/G -> { C , G } -> C is the reference allele
|
|
|
|
|
* 1 base deletion of C -> { C , - } -> C is the reference allele
|
2010-02-02 01:49:51 +08:00
|
|
|
* 1 base insertion of A -> { - , A } -> Null is the reference allele
|
2010-01-26 21:53:29 +08:00
|
|
|
*
|
|
|
|
|
* Suppose I see the following in the population:
|
|
|
|
|
*
|
|
|
|
|
* Ref: a t C g a // C is the reference base
|
|
|
|
|
* : a t G g a // C base is a G in some individuals
|
|
|
|
|
* : a t - g a // C base is deleted w.r.t. the reference
|
|
|
|
|
*
|
|
|
|
|
* How do I represent this? There are three segregating alleles:
|
|
|
|
|
*
|
|
|
|
|
* { C , G , - }
|
|
|
|
|
*
|
|
|
|
|
* Now suppose I have this more complex example:
|
|
|
|
|
*
|
|
|
|
|
* Ref: a t C g a // C is the reference base
|
|
|
|
|
* : a t - g a
|
|
|
|
|
* : a t - - a
|
|
|
|
|
* : a t CAg a
|
|
|
|
|
*
|
|
|
|
|
* There are actually four segregating alleles:
|
|
|
|
|
*
|
|
|
|
|
* { C g , - g, - -, and CAg } over bases 2-4
|
|
|
|
|
*
|
|
|
|
|
* However, the molecular equivalence explicitly listed above is usually discarded, so the actual
|
|
|
|
|
* segregating alleles are:
|
|
|
|
|
*
|
|
|
|
|
* { C g, g, -, C a g }
|
|
|
|
|
*
|
|
|
|
|
* Critically, it should be possible to apply an allele to a reference sequence to create the
|
|
|
|
|
* correct haplotype sequence:
|
|
|
|
|
*
|
|
|
|
|
* Allele + reference => haplotype
|
|
|
|
|
*
|
|
|
|
|
* For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the
|
|
|
|
|
* Allele object itself. So there's an idea of an A/C polymorphism independent of its surrounding context.
|
|
|
|
|
*
|
|
|
|
|
* Given a list of alleles, it's possible to determine the "type" of the variation
|
|
|
|
|
*
|
|
|
|
|
* A / C @ loc => SNP with
|
|
|
|
|
* - / A => INDEL
|
|
|
|
|
*
|
2010-02-02 01:49:51 +08:00
|
|
|
* If you know which allele is the reference, you can determine whether the variant is an insertion or deletion.
|
|
|
|
|
*
|
|
|
|
|
* Allele also supports the concept of a NO_CALL allele. This Allele represents a haplotype that couldn't be
|
|
|
|
|
* determined. This is usually represented by a '.' allele.
|
|
|
|
|
*
|
|
|
|
|
* Note that Alleles store all bases as bytes, in **UPPER CASE**. So 'atc' == 'ATC' from the perspective of an
|
|
|
|
|
* Allele.
|
2010-01-26 21:53:29 +08:00
|
|
|
*/
|
|
|
|
|
public class Allele {
|
2010-01-28 12:10:16 +08:00
|
|
|
private static final byte[] EMPTY_ALLELE_BASES = new byte[0];
|
2010-01-28 01:19:37 +08:00
|
|
|
|
2010-01-26 21:53:29 +08:00
|
|
|
private boolean isRef = false;
|
2010-01-28 12:10:16 +08:00
|
|
|
private boolean isNull = false;
|
|
|
|
|
private boolean isNoCall = false;
|
|
|
|
|
|
2010-01-26 21:53:29 +08:00
|
|
|
private byte[] bases = null;
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** A generic static NO_CALL allele for use */
|
2010-01-28 12:10:16 +08:00
|
|
|
public final static Allele NO_CALL = new Allele(".");
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* Create a new Allele that includes bases and if tagged as the reference allele if isRef == true. If bases
|
|
|
|
|
* == '-', a Null allele is created. If bases == '.', a no call Allele is created.
|
|
|
|
|
*
|
|
|
|
|
* @param bases the DNA sequence of this variation, '-', of '.'
|
|
|
|
|
* @param isRef should we make this a reference allele?
|
|
|
|
|
* @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformated
|
|
|
|
|
*/
|
2010-01-26 21:53:29 +08:00
|
|
|
public Allele(byte[] bases, boolean isRef) {
|
|
|
|
|
if ( bases == null )
|
|
|
|
|
throw new IllegalArgumentException("Constructor: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");
|
|
|
|
|
|
2010-01-28 01:19:37 +08:00
|
|
|
// standardize our representation of null allele and bases
|
2010-01-28 12:10:16 +08:00
|
|
|
if ( wouldBeNullAllele(bases) ) {
|
|
|
|
|
bases = EMPTY_ALLELE_BASES;
|
|
|
|
|
isNull = true;
|
2010-02-02 01:49:51 +08:00
|
|
|
} else if ( wouldBeNoCallAllele(bases) ) {
|
2010-01-28 12:10:16 +08:00
|
|
|
bases = EMPTY_ALLELE_BASES;
|
|
|
|
|
isNoCall = true;
|
|
|
|
|
if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
|
|
|
|
|
}
|
2010-01-28 01:19:37 +08:00
|
|
|
else
|
|
|
|
|
bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance
|
|
|
|
|
|
|
|
|
|
this.isRef = isRef;
|
2010-01-26 21:53:29 +08:00
|
|
|
this.bases = bases;
|
2010-01-28 12:10:16 +08:00
|
|
|
|
|
|
|
|
if ( ! acceptableAlleleBases(bases) )
|
|
|
|
|
throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases));
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* Do the bases represent the null allele?
|
|
|
|
|
*/
|
|
|
|
|
public static boolean wouldBeNullAllele(byte[] bases) {
|
2010-01-28 12:10:16 +08:00
|
|
|
return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Do the bases represent the NO_CALL allele? */
|
|
|
|
|
public static boolean wouldBeNoCallAllele(byte[] bases) {
|
2010-01-28 12:10:16 +08:00
|
|
|
return bases.length == 1 && bases[0] == '.';
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Do the bases represent the null allele? */
|
|
|
|
|
public static boolean acceptableAlleleBases(String bases) {
|
2010-01-28 12:10:16 +08:00
|
|
|
return acceptableAlleleBases(bases.getBytes());
|
|
|
|
|
}
|
2010-02-02 01:49:51 +08:00
|
|
|
|
|
|
|
|
/** Can we create an allele from bases, including NO_CALL and Null alleles? */
|
|
|
|
|
public static boolean acceptableAlleleBases(byte[] bases) {
|
|
|
|
|
if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) )
|
2010-01-28 12:10:16 +08:00
|
|
|
return true;
|
|
|
|
|
|
2010-01-26 21:53:29 +08:00
|
|
|
for ( byte b : bases ) {
|
|
|
|
|
if ( ! BaseUtils.isRegularBase(b) ) {
|
2010-01-28 12:10:16 +08:00
|
|
|
return false;
|
2010-01-26 21:53:29 +08:00
|
|
|
}
|
|
|
|
|
}
|
2010-01-28 12:10:16 +08:00
|
|
|
|
|
|
|
|
return true;
|
2010-01-26 21:53:29 +08:00
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* @see Allele(byte[], boolean)
|
|
|
|
|
*
|
|
|
|
|
* @param bases
|
|
|
|
|
* @param isRef
|
|
|
|
|
*/
|
2010-01-26 21:53:29 +08:00
|
|
|
public Allele(String bases, boolean isRef) {
|
|
|
|
|
this(bases.getBytes(), isRef);
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
|
|
|
|
|
*
|
|
|
|
|
* @param bases
|
|
|
|
|
*/
|
2010-01-28 01:19:37 +08:00
|
|
|
public Allele(String bases) { this(bases, false); }
|
2010-02-02 01:49:51 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Creates a non-Ref allele. @see Allele(byte[], boolean) for full information
|
|
|
|
|
*
|
|
|
|
|
* @param bases
|
|
|
|
|
*/
|
2010-01-28 01:19:37 +08:00
|
|
|
public Allele(byte[] bases) { this(bases, false); }
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
// ---------------------------------------------------------------------------------------------------------
|
2010-01-26 21:53:29 +08:00
|
|
|
//
|
|
|
|
|
// accessor routines
|
|
|
|
|
//
|
2010-02-02 01:49:51 +08:00
|
|
|
// ---------------------------------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
/** Returns true if this is the null allele */
|
2010-01-28 12:10:16 +08:00
|
|
|
public boolean isNull() { return isNull; }
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns true if this is not the null allele */
|
2010-01-28 12:10:16 +08:00
|
|
|
public boolean isNonNull() { return ! isNull(); }
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns true if this is the NO_CALL allele */
|
2010-01-28 12:10:16 +08:00
|
|
|
public boolean isNoCall() { return isNoCall; }
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns true if this is the not the NO_CALL allele */
|
2010-01-28 12:10:16 +08:00
|
|
|
public boolean isCalled() { return ! isNoCall(); }
|
2010-01-26 21:53:29 +08:00
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns true if this Allele is the reference allele */
|
2010-01-26 21:53:29 +08:00
|
|
|
public boolean isReference() { return isRef; }
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns true if this Allele is not the reference allele */
|
2010-01-26 21:53:29 +08:00
|
|
|
public boolean isNonReference() { return ! isReference(); }
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/** Returns a nice string representation of this object */
|
2010-01-28 01:19:37 +08:00
|
|
|
public String toString() {
|
2010-01-28 12:10:16 +08:00
|
|
|
return (isNull() ? "-" : ( isNoCall() ? "." : new String(getBases()))) + (isReference() ? "*" : "");
|
2010-01-28 01:19:37 +08:00
|
|
|
}
|
2010-01-26 21:53:29 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return the DNA bases segregating in this allele. Note this isn't reference polarized,
|
|
|
|
|
* so the Null allele is represented by a vector of length 0
|
|
|
|
|
*
|
|
|
|
|
* @return the segregating bases
|
|
|
|
|
*/
|
|
|
|
|
public byte[] getBases() { return bases; }
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param other the other allele
|
|
|
|
|
*
|
|
|
|
|
* @return true if these alleles are equal
|
|
|
|
|
*/
|
|
|
|
|
public boolean equals(Allele other) {
|
2010-02-02 22:18:46 +08:00
|
|
|
return equals(other, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns true if this and other are equal. If ignoreRefState is true, then doesn't require both alleles has the
|
|
|
|
|
* same ref tag
|
|
|
|
|
*
|
|
|
|
|
* @param other
|
|
|
|
|
* @param ignoreRefState
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public boolean equals(Allele other, boolean ignoreRefState) {
|
|
|
|
|
return (isRef == other.isRef || ignoreRefState) && isNull == other.isNull && isNoCall == other.isNoCall && this.basesMatch(other.getBases());
|
2010-01-26 21:53:29 +08:00
|
|
|
}
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* Returns true if this Alelle contains the same bases as test, regardless of its reference status. Also handles
|
|
|
|
|
* Null and NO_CALL alleles
|
|
|
|
|
*
|
|
|
|
|
* @param test
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-28 01:19:37 +08:00
|
|
|
public boolean basesMatch(byte[] test) { return bases == test || Arrays.equals(bases, test); }
|
2010-02-02 01:49:51 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns true if this Alelle contains the same bases as test, regardless of its reference status. Also handles
|
|
|
|
|
* Null and NO_CALL alleles
|
|
|
|
|
*
|
|
|
|
|
* @param test
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-28 01:19:37 +08:00
|
|
|
public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }
|
2010-02-02 01:49:51 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns true if this Alelle contains the same bases as test, regardless of its reference status. Also handles
|
|
|
|
|
* Null and NO_CALL alleles
|
|
|
|
|
*
|
|
|
|
|
* @param test
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-28 01:19:37 +08:00
|
|
|
public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }
|
|
|
|
|
|
2010-02-02 01:49:51 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the length of this allele. Null and NO_CALL alleles have 0 length.
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-26 21:53:29 +08:00
|
|
|
public int length() {
|
|
|
|
|
return bases.length;
|
|
|
|
|
}
|
|
|
|
|
}
|