2009-04-13 08:46:23 +08:00
|
|
|
package org.broadinstitute.sting.utils;
|
|
|
|
|
|
2012-08-22 23:26:08 +08:00
|
|
|
import net.sf.samtools.util.StringUtil;
|
2011-04-08 01:03:48 +08:00
|
|
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
2009-11-25 11:51:41 +08:00
|
|
|
|
2012-06-28 04:55:49 +08:00
|
|
|
import java.util.Arrays;
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
|
|
|
|
*/
|
2009-04-13 08:46:23 +08:00
|
|
|
public class BaseUtils {
|
2012-02-08 07:11:53 +08:00
|
|
|
public final static byte A = (byte) 'A';
|
|
|
|
|
public final static byte C = (byte) 'C';
|
|
|
|
|
public final static byte G = (byte) 'G';
|
|
|
|
|
public final static byte T = (byte) 'T';
|
2010-05-21 05:02:44 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public final static byte N = (byte) 'N';
|
|
|
|
|
public final static byte D = (byte) 'D';
|
2010-05-21 05:02:44 +08:00
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
//
|
|
|
|
|
// todo -- we need a generalized base abstraction using the Base enum.
|
|
|
|
|
//
|
2012-02-08 07:11:53 +08:00
|
|
|
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
|
|
|
|
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
2010-03-29 05:45:22 +08:00
|
|
|
|
|
|
|
|
public enum Base {
|
2012-02-08 07:11:53 +08:00
|
|
|
A('A', 0),
|
|
|
|
|
C('C', 1),
|
|
|
|
|
G('G', 2),
|
|
|
|
|
T('T', 3);
|
2010-03-29 05:45:22 +08:00
|
|
|
|
|
|
|
|
byte b;
|
|
|
|
|
int index;
|
2012-02-08 07:11:53 +08:00
|
|
|
|
2010-03-29 05:45:22 +08:00
|
|
|
private Base(char base, int index) {
|
2012-02-08 07:11:53 +08:00
|
|
|
this.b = (byte) base;
|
2010-03-29 05:45:22 +08:00
|
|
|
this.index = index;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public byte getBase() { return b; }
|
2012-02-08 07:11:53 +08:00
|
|
|
|
|
|
|
|
public char getBaseAsChar() { return (char) b; }
|
|
|
|
|
|
2010-03-29 05:45:22 +08:00
|
|
|
public int getIndex() { return index; }
|
|
|
|
|
|
|
|
|
|
public boolean sameBase(byte o) { return b == o; }
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public boolean sameBase(char o) { return b == (byte) o; }
|
|
|
|
|
|
|
|
|
|
public boolean sameBase(int i) { return index == i; }
|
|
|
|
|
}
|
2010-03-29 05:45:22 +08:00
|
|
|
|
2012-06-28 04:55:49 +08:00
|
|
|
static private final int[] baseIndexMap = new int[256];
|
|
|
|
|
static {
|
|
|
|
|
Arrays.fill(baseIndexMap, -1);
|
|
|
|
|
baseIndexMap['A'] = 0;
|
|
|
|
|
baseIndexMap['a'] = 0;
|
|
|
|
|
baseIndexMap['*'] = 0; // the wildcard character counts as an A
|
|
|
|
|
baseIndexMap['C'] = 1;
|
|
|
|
|
baseIndexMap['c'] = 1;
|
|
|
|
|
baseIndexMap['G'] = 2;
|
|
|
|
|
baseIndexMap['g'] = 2;
|
|
|
|
|
baseIndexMap['T'] = 3;
|
|
|
|
|
baseIndexMap['t'] = 3;
|
|
|
|
|
}
|
|
|
|
|
|
2010-03-04 23:00:02 +08:00
|
|
|
// todo -- fix me (enums?)
|
|
|
|
|
public static final byte DELETION_INDEX = 4;
|
|
|
|
|
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
2009-05-23 01:05:06 +08:00
|
|
|
|
2012-08-17 02:00:48 +08:00
|
|
|
public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A');
|
|
|
|
|
public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C');
|
|
|
|
|
public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G');
|
|
|
|
|
public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T');
|
2010-03-15 05:08:14 +08:00
|
|
|
|
2009-05-23 01:05:06 +08:00
|
|
|
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
|
|
|
|
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
|
|
|
|
// Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
|
|
|
|
|
public enum BaseSubstitutionType {
|
|
|
|
|
TRANSITION, // A <-> G or C <-> T
|
|
|
|
|
TRANSVERSION
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the base substitution type of the 2 state SNP
|
2012-02-08 07:11:53 +08:00
|
|
|
*
|
2009-05-23 01:05:06 +08:00
|
|
|
* @param base1
|
|
|
|
|
* @param base2
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2012-02-08 07:11:53 +08:00
|
|
|
public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) {
|
2009-05-23 01:05:06 +08:00
|
|
|
BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
|
|
|
|
|
//System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
|
|
|
|
|
return t;
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public static boolean isTransition(byte base1, byte base2) {
|
2009-05-23 01:05:06 +08:00
|
|
|
int b1 = simpleBaseToBaseIndex(base1);
|
|
|
|
|
int b2 = simpleBaseToBaseIndex(base2);
|
|
|
|
|
return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 ||
|
2012-02-08 07:11:53 +08:00
|
|
|
b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
|
2009-05-23 01:05:06 +08:00
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public static boolean isTransversion(byte base1, byte base2) {
|
|
|
|
|
return !isTransition(base1, base2);
|
2009-05-23 01:05:06 +08:00
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
/**
|
|
|
|
|
* Private constructor. No instantiating this class!
|
|
|
|
|
*/
|
2009-04-24 01:45:39 +08:00
|
|
|
private BaseUtils() {}
|
2009-05-22 06:25:16 +08:00
|
|
|
|
|
|
|
|
static public boolean basesAreEqual(byte base1, byte base2) {
|
2010-05-20 22:05:13 +08:00
|
|
|
return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2);
|
2009-05-22 06:25:16 +08:00
|
|
|
}
|
|
|
|
|
|
2011-01-06 04:05:56 +08:00
|
|
|
static public boolean extendedBasesAreEqual(byte base1, byte base2) {
|
|
|
|
|
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
|
|
|
|
}
|
|
|
|
|
|
2012-06-08 22:42:42 +08:00
|
|
|
/**
|
|
|
|
|
* @return true iff the bases array contains at least one instance of base
|
|
|
|
|
*/
|
|
|
|
|
static public boolean containsBase(final byte[] bases, final byte base) {
|
|
|
|
|
for ( final byte b : bases ) {
|
|
|
|
|
if ( b == base )
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-16 02:34:41 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a IUPAC nucleotide code to a pair of bases
|
|
|
|
|
*
|
|
|
|
|
* @param code
|
|
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-07-16 02:34:41 +08:00
|
|
|
static public char[] iupacToBases(char code) {
|
|
|
|
|
char[] bases = new char[2];
|
|
|
|
|
switch (code) {
|
|
|
|
|
case '*': // the wildcard character counts as an A
|
|
|
|
|
case 'A':
|
|
|
|
|
case 'a':
|
|
|
|
|
bases[0] = bases[1] = 'A';
|
|
|
|
|
break;
|
|
|
|
|
case 'C':
|
|
|
|
|
case 'c':
|
|
|
|
|
bases[0] = bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
case 'G':
|
|
|
|
|
case 'g':
|
|
|
|
|
bases[0] = bases[1] = 'G';
|
|
|
|
|
break;
|
|
|
|
|
case 'T':
|
|
|
|
|
case 't':
|
|
|
|
|
bases[0] = bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'R':
|
|
|
|
|
case 'r':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'G';
|
|
|
|
|
break;
|
|
|
|
|
case 'Y':
|
|
|
|
|
case 'y':
|
|
|
|
|
bases[0] = 'C';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'S':
|
|
|
|
|
case 's':
|
|
|
|
|
bases[0] = 'G';
|
|
|
|
|
bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
case 'W':
|
|
|
|
|
case 'w':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'K':
|
|
|
|
|
case 'k':
|
|
|
|
|
bases[0] = 'G';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'M':
|
|
|
|
|
case 'm':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
bases[0] = bases[1] = 'N';
|
|
|
|
|
}
|
|
|
|
|
return bases;
|
|
|
|
|
}
|
2010-05-20 20:38:06 +08:00
|
|
|
|
2010-07-20 03:10:29 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a simple base to a base index
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param base [AaCcGgTt]
|
2010-07-20 03:10:29 +08:00
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
|
|
|
|
static public int simpleBaseToBaseIndex(byte base) {
|
2012-06-28 04:55:49 +08:00
|
|
|
return baseIndexMap[base];
|
2010-07-20 03:10:29 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a simple base to a base index
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param base [AaCcGgTt]
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-04-13 08:46:23 +08:00
|
|
|
static public int simpleBaseToBaseIndex(char base) {
|
2012-06-28 04:55:49 +08:00
|
|
|
return baseIndexMap[base];
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|
|
|
|
|
|
2010-05-21 05:02:44 +08:00
|
|
|
static public int extendedBaseToBaseIndex(byte base) {
|
2010-03-04 23:00:02 +08:00
|
|
|
switch (base) {
|
|
|
|
|
case 'd':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'D':
|
|
|
|
|
return DELETION_INDEX;
|
2010-03-04 23:00:02 +08:00
|
|
|
case 'n':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'N':
|
|
|
|
|
return NO_CALL_INDEX;
|
2010-03-04 23:00:02 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
default:
|
|
|
|
|
return simpleBaseToBaseIndex(base);
|
2010-03-04 23:00:02 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isRegularBase( final char base ) {
|
2009-09-10 05:19:36 +08:00
|
|
|
return simpleBaseToBaseIndex(base) != -1;
|
|
|
|
|
}
|
|
|
|
|
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isRegularBase( final byte base ) {
|
2010-05-21 05:02:44 +08:00
|
|
|
return simpleBaseToBaseIndex(base) != -1;
|
2009-12-02 23:41:35 +08:00
|
|
|
}
|
|
|
|
|
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isAllRegularBases( final byte[] bases ) {
|
|
|
|
|
for( final byte base : bases) {
|
|
|
|
|
if( !isRegularBase(base) ) { return false; }
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2010-01-06 23:01:51 +08:00
|
|
|
static public boolean isNBase(byte base) {
|
2011-05-20 03:50:39 +08:00
|
|
|
return base == 'N' || base == 'n';
|
2010-01-06 23:01:51 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a base index to a simple base
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseIndex 0, 1, 2, 3
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return A, C, G, T, or '.' if the index can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte baseIndexToSimpleBase(int baseIndex) {
|
2009-04-13 08:46:23 +08:00
|
|
|
switch (baseIndex) {
|
2012-02-08 07:11:53 +08:00
|
|
|
case 0:
|
|
|
|
|
return 'A';
|
|
|
|
|
case 1:
|
|
|
|
|
return 'C';
|
|
|
|
|
case 2:
|
|
|
|
|
return 'G';
|
|
|
|
|
case 3:
|
|
|
|
|
return 'T';
|
|
|
|
|
default:
|
|
|
|
|
return '.';
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-04-14 22:49:12 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Converts a base index to a base index representing its cross-talk partner
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseIndex 0, 1, 2, 3
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
|
|
|
|
*/
|
|
|
|
|
static public int crossTalkPartnerIndex(int baseIndex) {
|
|
|
|
|
switch (baseIndex) {
|
2012-02-08 07:11:53 +08:00
|
|
|
case 0:
|
|
|
|
|
return 1; // A -> C
|
|
|
|
|
case 1:
|
|
|
|
|
return 0; // C -> A
|
|
|
|
|
case 2:
|
|
|
|
|
return 3; // G -> T
|
|
|
|
|
case 3:
|
|
|
|
|
return 2; // T -> G
|
|
|
|
|
default:
|
|
|
|
|
return -1;
|
2009-04-14 22:49:12 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Converts a base to the base representing its cross-talk partner
|
2009-04-22 06:25:33 +08:00
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param base [AaCcGgTt]
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return C, A, T, G, or '.' if the base can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-04-14 22:49:12 +08:00
|
|
|
static public char crossTalkPartnerBase(char base) {
|
2012-02-08 07:11:53 +08:00
|
|
|
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
2009-04-14 22:49:12 +08:00
|
|
|
}
|
2009-04-22 06:25:33 +08:00
|
|
|
|
2009-05-15 02:57:48 +08:00
|
|
|
/**
|
|
|
|
|
* Return the complement of a base index.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
2009-05-15 02:57:48 +08:00
|
|
|
* @return the complementary base index
|
|
|
|
|
*/
|
|
|
|
|
static public byte complementIndex(int baseIndex) {
|
|
|
|
|
switch (baseIndex) {
|
2012-02-08 07:11:53 +08:00
|
|
|
case 0:
|
|
|
|
|
return 3; // a -> t
|
|
|
|
|
case 1:
|
|
|
|
|
return 2; // c -> g
|
|
|
|
|
case 2:
|
|
|
|
|
return 1; // g -> c
|
|
|
|
|
case 3:
|
|
|
|
|
return 0; // t -> a
|
|
|
|
|
default:
|
|
|
|
|
return -1; // wtf?
|
2009-05-15 02:57:48 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
/**
|
2009-12-07 22:26:27 +08:00
|
|
|
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
|
|
|
|
*
|
|
|
|
|
* @param base the base [AaCcGgTt]
|
2009-05-23 03:32:20 +08:00
|
|
|
* @return the complementary base, or the input base if it's not one of the understood ones
|
2009-05-15 02:57:48 +08:00
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte simpleComplement(byte base) {
|
2009-04-22 06:25:33 +08:00
|
|
|
switch (base) {
|
|
|
|
|
case 'A':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'a':
|
|
|
|
|
return 'T';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'C':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'c':
|
|
|
|
|
return 'G';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'G':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'g':
|
|
|
|
|
return 'C';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'T':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 't':
|
|
|
|
|
return 'A';
|
|
|
|
|
default:
|
|
|
|
|
return base;
|
2009-04-22 06:25:33 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
|
|
|
|
static public char simpleComplement(char base) {
|
2012-02-08 07:11:53 +08:00
|
|
|
return (char) simpleComplement((byte) base);
|
2010-05-20 22:05:13 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-15 02:57:48 +08:00
|
|
|
/**
|
2009-05-22 03:39:39 +08:00
|
|
|
* Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
|
|
|
|
*
|
2009-12-07 22:26:27 +08:00
|
|
|
* @param bases the byte array of bases
|
2009-05-15 02:57:48 +08:00
|
|
|
* @return the reverse complement of the base byte array
|
|
|
|
|
*/
|
2009-04-22 06:25:33 +08:00
|
|
|
static public byte[] simpleReverseComplement(byte[] bases) {
|
|
|
|
|
byte[] rcbases = new byte[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
2010-07-20 03:10:29 +08:00
|
|
|
rcbases[i] = simpleComplement(bases[bases.length - 1 - i]);
|
2009-04-22 06:25:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
2009-05-22 02:30:45 +08:00
|
|
|
|
2009-10-17 06:29:35 +08:00
|
|
|
/**
|
|
|
|
|
* Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param bases the byte array of bases
|
2009-10-17 06:29:35 +08:00
|
|
|
* @return the complement of the base byte array
|
|
|
|
|
*/
|
|
|
|
|
static public byte[] simpleComplement(byte[] bases) {
|
|
|
|
|
byte[] rcbases = new byte[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
2010-07-20 03:10:29 +08:00
|
|
|
rcbases[i] = simpleComplement(bases[i]);
|
2009-10-17 06:29:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
|
|
|
|
|
2009-12-18 04:07:26 +08:00
|
|
|
/**
|
|
|
|
|
* Reverse complement a char array of bases
|
|
|
|
|
*
|
|
|
|
|
* @param bases the char array of bases
|
|
|
|
|
* @return the reverse complement of the char byte array
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-12-18 04:07:26 +08:00
|
|
|
static public char[] simpleReverseComplement(char[] bases) {
|
|
|
|
|
char[] rcbases = new char[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
|
|
|
|
rcbases[i] = simpleComplement(bases[bases.length - 1 - i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Complement a char array of bases
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param bases the char array of bases
|
2009-12-18 04:07:26 +08:00
|
|
|
* @return the complement of the base char array
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-12-18 04:07:26 +08:00
|
|
|
static public char[] simpleComplement(char[] bases) {
|
|
|
|
|
char[] rcbases = new char[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
|
|
|
|
rcbases[i] = simpleComplement(bases[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:39:39 +08:00
|
|
|
/**
|
|
|
|
|
* Reverse complement a String of bases. Preserves ambiguous bases.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param bases the String of bases
|
2009-05-22 03:39:39 +08:00
|
|
|
* @return the reverse complement of the String
|
|
|
|
|
*/
|
2010-07-20 03:10:29 +08:00
|
|
|
@Deprecated
|
2009-05-22 03:39:39 +08:00
|
|
|
static public String simpleReverseComplement(String bases) {
|
2009-05-23 03:32:20 +08:00
|
|
|
return new String(simpleReverseComplement(bases.getBytes()));
|
2009-05-22 03:39:39 +08:00
|
|
|
}
|
|
|
|
|
|
2009-10-17 06:29:35 +08:00
|
|
|
/**
|
|
|
|
|
* Complement a String of bases. Preserves ambiguous bases.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param bases the String of bases
|
2009-10-17 06:29:35 +08:00
|
|
|
* @return the complement of the String
|
|
|
|
|
*/
|
2010-07-20 03:10:29 +08:00
|
|
|
@Deprecated
|
2009-10-17 06:29:35 +08:00
|
|
|
static public String simpleComplement(String bases) {
|
|
|
|
|
return new String(simpleComplement(bases.getBytes()));
|
|
|
|
|
}
|
|
|
|
|
|
2012-07-26 13:50:39 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the uppercased version of the bases
|
|
|
|
|
*
|
|
|
|
|
* @param bases the bases
|
|
|
|
|
* @return the upper cased version
|
|
|
|
|
*/
|
2012-08-22 23:26:08 +08:00
|
|
|
static public void convertToUpperCase(final byte[] bases) {
|
|
|
|
|
StringUtil.toUpperCase(bases);
|
2012-07-26 13:50:39 +08:00
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the index of the most common base in the basecounts array. To be used with
|
|
|
|
|
* pileup.getBaseCounts.
|
|
|
|
|
*
|
|
|
|
|
* @param baseCounts counts of a,c,g,t in order.
|
|
|
|
|
* @return the index of the most common base
|
|
|
|
|
*/
|
|
|
|
|
static public int mostFrequentBaseIndex(int[] baseCounts) {
|
|
|
|
|
int mostFrequentBaseIndex = 0;
|
|
|
|
|
for (int baseIndex = 1; baseIndex < 4; baseIndex++) {
|
|
|
|
|
if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) {
|
|
|
|
|
mostFrequentBaseIndex = baseIndex;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return mostFrequentBaseIndex;
|
|
|
|
|
}
|
|
|
|
|
|
2011-08-05 05:49:08 +08:00
|
|
|
static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) {
|
|
|
|
|
int tmp = baseCounts[refBaseIndex];
|
|
|
|
|
baseCounts[refBaseIndex] = -1;
|
|
|
|
|
int result = mostFrequentBaseIndex(baseCounts);
|
|
|
|
|
baseCounts[refBaseIndex] = tmp;
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) {
|
|
|
|
|
return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase));
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseCounts counts of a,c,g,t in order.
|
2011-07-29 06:58:36 +08:00
|
|
|
* @return the most common base
|
|
|
|
|
*/
|
|
|
|
|
static public byte mostFrequentSimpleBase(int[] baseCounts) {
|
|
|
|
|
return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts));
|
|
|
|
|
}
|
2009-08-13 04:16:22 +08:00
|
|
|
|
2009-05-22 04:35:31 +08:00
|
|
|
/**
|
|
|
|
|
* For the most frequent base in the sequence, return the percentage of the read it constitutes.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param sequence the read sequence
|
|
|
|
|
* @return the percentage of the read that's made up of the most frequent base
|
2009-05-22 04:35:31 +08:00
|
|
|
*/
|
|
|
|
|
static public double mostFrequentBaseFraction(byte[] sequence) {
|
|
|
|
|
int[] baseCounts = new int[4];
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
for (byte base : sequence) {
|
2010-07-20 03:10:29 +08:00
|
|
|
int baseIndex = simpleBaseToBaseIndex(base);
|
2009-05-22 04:35:31 +08:00
|
|
|
|
|
|
|
|
if (baseIndex >= 0) {
|
|
|
|
|
baseCounts[baseIndex]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
|
2009-05-22 04:35:31 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length);
|
2009-05-22 04:35:31 +08:00
|
|
|
}
|
2009-06-09 08:47:54 +08:00
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
// --------------------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// random bases
|
|
|
|
|
//
|
|
|
|
|
// --------------------------------------------------------------------------------
|
|
|
|
|
|
2009-06-09 08:47:54 +08:00
|
|
|
/**
|
|
|
|
|
* Return a random base index (A=0, C=1, G=2, T=3).
|
|
|
|
|
*
|
|
|
|
|
* @return a random base index (A=0, C=1, G=2, T=3)
|
|
|
|
|
*/
|
|
|
|
|
static public int getRandomBaseIndex() {
|
|
|
|
|
return getRandomBaseIndex(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return a random base index, excluding some base index.
|
|
|
|
|
*
|
|
|
|
|
* @param excludeBaseIndex the base index to exclude
|
|
|
|
|
* @return a random base index, excluding the one specified (A=0, C=1, G=2, T=3)
|
|
|
|
|
*/
|
|
|
|
|
static public int getRandomBaseIndex(int excludeBaseIndex) {
|
|
|
|
|
int randomBaseIndex = excludeBaseIndex;
|
|
|
|
|
|
|
|
|
|
while (randomBaseIndex == excludeBaseIndex) {
|
2011-04-08 01:03:48 +08:00
|
|
|
randomBaseIndex = GenomeAnalysisEngine.getRandomGenerator().nextInt(4);
|
2009-06-09 08:47:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return randomBaseIndex;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return a random base (A, C, G, T).
|
|
|
|
|
*
|
|
|
|
|
* @return a random base (A, C, G, T)
|
|
|
|
|
*/
|
2010-07-20 03:10:29 +08:00
|
|
|
@Deprecated
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte getRandomBase() {
|
2009-06-09 08:47:54 +08:00
|
|
|
return getRandomBase('.');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return a random base, excluding some base.
|
|
|
|
|
*
|
|
|
|
|
* @param excludeBase the base to exclude
|
|
|
|
|
* @return a random base, excluding the one specified (A, C, G, T)
|
|
|
|
|
*/
|
2010-07-20 03:10:29 +08:00
|
|
|
@Deprecated
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte getRandomBase(char excludeBase) {
|
2009-06-09 08:47:54 +08:00
|
|
|
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
|
|
|
|
}
|
2012-02-08 07:11:53 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
2009-06-07 07:05:24 +08:00
|
|
|
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
2012-02-08 07:11:53 +08:00
|
|
|
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
|
|
|
|
* of 2 (it has a period of 4 as well), and so does
|
|
|
|
|
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
2009-06-07 07:05:24 +08:00
|
|
|
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
|
|
|
|
* or if specified minPeriod is greater than the sequence length.
|
2012-02-08 07:11:53 +08:00
|
|
|
*
|
2009-06-07 07:05:24 +08:00
|
|
|
* @param seq
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-28 12:17:20 +08:00
|
|
|
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
2012-02-08 07:11:53 +08:00
|
|
|
int period = (minPeriod > seq.length ? seq.length : minPeriod);
|
|
|
|
|
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
|
|
|
|
// until we find correct period
|
|
|
|
|
|
|
|
|
|
for (int pos = period; pos < seq.length; pos++) {
|
|
|
|
|
|
|
|
|
|
int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
|
|
|
|
|
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
|
|
|
|
|
|
|
|
|
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
|
|
|
|
|
|
|
|
|
|
// period we have been trying so far does not work.
|
|
|
|
|
// two possibilities:
|
|
|
|
|
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
|
|
|
|
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
|
|
|
|
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
|
|
|
|
// pos will be autoincremented and we will be checking next base
|
|
|
|
|
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
|
|
|
|
// hence we should first check if it matches the first base of the sequence, and to do that
|
|
|
|
|
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
|
|
|
|
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
|
|
|
|
// on the next loop re-entrance after pos is autoincremented)
|
|
|
|
|
if (offset == 0)
|
|
|
|
|
period = pos + 1;
|
|
|
|
|
else
|
|
|
|
|
period = pos--;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return period;
|
2009-06-07 07:05:24 +08:00
|
|
|
}
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|
2009-06-07 07:05:24 +08:00
|
|
|
|
|
|
|
|
/* code snippet for testing sequencePeriod():
|
|
|
|
|
*
|
|
|
|
|
* String str = "CCTTG";
|
|
|
|
|
int p = 0;
|
|
|
|
|
System.out.print("Periods of " + str +" are:");
|
|
|
|
|
while ( p < str.length() ) {
|
|
|
|
|
p = sequencePeriod(str, p+1);
|
|
|
|
|
System.out.print(" "+p);
|
|
|
|
|
}
|
|
|
|
|
System.out.println(); System.exit(1);
|
|
|
|
|
*/
|