2013-01-11 06:04:08 +08:00
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2012 The Broad Institute
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person
|
|
|
|
|
* obtaining a copy of this software and associated documentation
|
|
|
|
|
* files (the "Software"), to deal in the Software without
|
|
|
|
|
* restriction, including without limitation the rights to use,
|
|
|
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
* copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following
|
|
|
|
|
* conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
|
|
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2013-01-31 01:54:57 +08:00
|
|
|
package org.broadinstitute.sting.utils;
|
2009-04-13 08:46:23 +08:00
|
|
|
|
2012-08-22 23:26:08 +08:00
|
|
|
import net.sf.samtools.util.StringUtil;
|
2013-01-31 01:54:57 +08:00
|
|
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
|
|
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
|
|
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
2009-11-25 11:51:41 +08:00
|
|
|
|
2012-06-28 04:55:49 +08:00
|
|
|
import java.util.Arrays;
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
|
|
|
|
*/
|
2009-04-13 08:46:23 +08:00
|
|
|
public class BaseUtils {
|
2010-05-21 05:02:44 +08:00
|
|
|
|
2013-01-16 23:22:43 +08:00
|
|
|
public enum Base {
|
|
|
|
|
A ((byte)'A'),
|
|
|
|
|
C ((byte)'C'),
|
|
|
|
|
G ((byte)'G'),
|
|
|
|
|
T ((byte)'T'),
|
|
|
|
|
N ((byte)'N'),
|
|
|
|
|
D ((byte)'D');
|
2010-05-21 05:02:44 +08:00
|
|
|
|
2013-01-16 23:22:43 +08:00
|
|
|
public byte base;
|
|
|
|
|
|
|
|
|
|
private Base(final byte base) {
|
|
|
|
|
this.base = base;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// todo -- add this to the generalized base abstraction using the Base enum.
|
2012-02-08 07:11:53 +08:00
|
|
|
// The four standard nucleotides as upper-case ASCII bytes, listed in base-index order (A, C, G, T).
// NOTE(review): the array itself is public, so its elements can be overwritten by any caller.
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
|
|
|
|
// The standard nucleotides followed by 'N' and 'D', in the same order as the constants of the Base enum.
// NOTE(review): the array itself is public, so its elements can be overwritten by any caller.
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
2010-03-29 05:45:22 +08:00
|
|
|
|
2012-06-28 04:55:49 +08:00
|
|
|
// Lookup table from an ASCII code (0-255) to a simple base index; -1 marks every unrecognized code.
static private final int[] baseIndexMap = new int[256];

static {
    Arrays.fill(baseIndexMap, -1);

    // A, C, G and T are recognized in both upper and lower case
    final Base[] simpleBases = {Base.A, Base.C, Base.G, Base.T};
    for ( final Base simpleBase : simpleBases ) {
        final int index = simpleBase.ordinal();
        baseIndexMap[simpleBase.base] = index;
        baseIndexMap[Character.toLowerCase(simpleBase.base)] = index;
    }

    // the wildcard character counts as an A
    baseIndexMap['*'] = Base.A.ordinal();
}
|
|
|
|
|
|
2013-01-16 23:22:43 +08:00
|
|
|
// Copy of baseIndexMap in which the wildcard is rejected and every IUPAC ambiguity code maps to N.
static private final int[] baseIndexWithIupacMap = baseIndexMap.clone();

static {
    // the wildcard character is bad
    baseIndexWithIupacMap['*'] = -1;

    // N plus the ambiguity codes R, Y, M, K, W, S, B, D, H and V, in both cases, all resolve to N
    final int indexOfN = Base.N.ordinal();
    for ( final char code : "NRYMKWSBDHV".toCharArray() ) {
        baseIndexWithIupacMap[code] = indexOfN;
        baseIndexWithIupacMap[Character.toLowerCase(code)] = indexOfN;
    }
}
|
2010-03-15 05:08:14 +08:00
|
|
|
|
2009-05-23 01:05:06 +08:00
|
|
|
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
|
|
|
|
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
|
|
|
|
// Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
|
|
|
|
|
public enum BaseSubstitutionType {
|
|
|
|
|
TRANSITION, // A <-> G or C <-> T
|
|
|
|
|
TRANSVERSION
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the base substitution type of the 2 state SNP
|
2012-02-08 07:11:53 +08:00
|
|
|
*
|
2009-05-23 01:05:06 +08:00
|
|
|
* @param base1
|
|
|
|
|
* @param base2
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2012-02-08 07:11:53 +08:00
|
|
|
public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) {
|
2009-05-23 01:05:06 +08:00
|
|
|
BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
|
|
|
|
|
//System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
|
|
|
|
|
return t;
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public static boolean isTransition(byte base1, byte base2) {
|
2013-01-16 23:22:43 +08:00
|
|
|
final int b1 = simpleBaseToBaseIndex(base1);
|
|
|
|
|
final int b2 = simpleBaseToBaseIndex(base2);
|
|
|
|
|
return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() ||
|
|
|
|
|
b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal();
|
2009-05-23 01:05:06 +08:00
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
public static boolean isTransversion(byte base1, byte base2) {
|
|
|
|
|
return !isTransition(base1, base2);
|
2009-05-23 01:05:06 +08:00
|
|
|
}
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
/**
 * Not instantiable: the constructor is private and deliberately empty.
 */
private BaseUtils() {
}
|
2009-05-22 06:25:16 +08:00
|
|
|
|
|
|
|
|
static public boolean basesAreEqual(byte base1, byte base2) {
|
2010-05-20 22:05:13 +08:00
|
|
|
return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2);
|
2009-05-22 06:25:16 +08:00
|
|
|
}
|
|
|
|
|
|
2011-01-06 04:05:56 +08:00
|
|
|
static public boolean extendedBasesAreEqual(byte base1, byte base2) {
|
|
|
|
|
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
|
|
|
|
}
|
|
|
|
|
|
2012-06-08 22:42:42 +08:00
|
|
|
/**
|
|
|
|
|
* @return true iff the bases array contains at least one instance of base
|
|
|
|
|
*/
|
|
|
|
|
static public boolean containsBase(final byte[] bases, final byte base) {
|
|
|
|
|
for ( final byte b : bases ) {
|
|
|
|
|
if ( b == base )
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
Work on GSA-508 / CachingIndexedFastaReader should internally upper case bases loading data
-- As one might expect, CachingIndexedFastaSequenceFile now internally upper cases the FASTA reference bases. This is now done by default, unless requested explicitly to preserve the original bases.
-- This is really the correct place to do this for a variety of reasons. First, you don't need to work about upper casing bases throughout the code. Second, the cache is only upper cased once, no matter how often the bases are accessed, which walkers cannot optimize themselves. Finally, this uses the fastest function for this -- Picard's toUpperCase(byte[]) which is way better than String.toUpperCase()
-- Added unit tests to ensure this functionality works correct.
-- Removing unnecessary upper casing of bases in some core GATK tools, now that RefContext guarentees that the reference bases are all upper case.
-- Added contracts to ensure this is the case.
-- Remove a ton of sh*t from BaseUtils that was so old I had no idea what it was doing any longer, and didn't have any unit tests to ensure it was correct, and wasn't used anywhere in our code
2012-10-30 07:06:05 +08:00
|
|
|
public static boolean isUpperCase(final byte[] bases) {
|
|
|
|
|
for ( byte base : bases )
|
|
|
|
|
if ( ! isUpperCase(base) )
|
|
|
|
|
return false;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static boolean isUpperCase(final byte base) {
|
|
|
|
|
return base >= 'A' && base <= 'Z';
|
|
|
|
|
}
|
|
|
|
|
|
2013-01-17 03:55:33 +08:00
|
|
|
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) {
|
2013-01-16 23:22:43 +08:00
|
|
|
final int length = bases.length;
|
2013-01-17 03:55:33 +08:00
|
|
|
final int start = ignoreConversionOfFirstByte ? 1 : 0;
|
|
|
|
|
|
|
|
|
|
for ( int i = start; i < length; i++ ) {
|
2013-01-16 23:22:43 +08:00
|
|
|
final int baseIndex = baseIndexWithIupacMap[bases[i]];
|
|
|
|
|
if ( baseIndex == Base.N.ordinal() ) {
|
|
|
|
|
bases[i] = 'N';
|
|
|
|
|
} else if ( errorOnBadReferenceBase && baseIndex == -1 ) {
|
2013-01-31 01:54:57 +08:00
|
|
|
throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'");
|
2013-01-16 23:22:43 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return bases;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-16 02:34:41 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a IUPAC nucleotide code to a pair of bases
|
|
|
|
|
*
|
|
|
|
|
* @param code
|
|
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-07-16 02:34:41 +08:00
|
|
|
static public char[] iupacToBases(char code) {
|
|
|
|
|
char[] bases = new char[2];
|
|
|
|
|
switch (code) {
|
|
|
|
|
case '*': // the wildcard character counts as an A
|
|
|
|
|
case 'A':
|
|
|
|
|
case 'a':
|
|
|
|
|
bases[0] = bases[1] = 'A';
|
|
|
|
|
break;
|
|
|
|
|
case 'C':
|
|
|
|
|
case 'c':
|
|
|
|
|
bases[0] = bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
case 'G':
|
|
|
|
|
case 'g':
|
|
|
|
|
bases[0] = bases[1] = 'G';
|
|
|
|
|
break;
|
|
|
|
|
case 'T':
|
|
|
|
|
case 't':
|
|
|
|
|
bases[0] = bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'R':
|
|
|
|
|
case 'r':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'G';
|
|
|
|
|
break;
|
|
|
|
|
case 'Y':
|
|
|
|
|
case 'y':
|
|
|
|
|
bases[0] = 'C';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'S':
|
|
|
|
|
case 's':
|
|
|
|
|
bases[0] = 'G';
|
|
|
|
|
bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
case 'W':
|
|
|
|
|
case 'w':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'K':
|
|
|
|
|
case 'k':
|
|
|
|
|
bases[0] = 'G';
|
|
|
|
|
bases[1] = 'T';
|
|
|
|
|
break;
|
|
|
|
|
case 'M':
|
|
|
|
|
case 'm':
|
|
|
|
|
bases[0] = 'A';
|
|
|
|
|
bases[1] = 'C';
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
bases[0] = bases[1] = 'N';
|
|
|
|
|
}
|
|
|
|
|
return bases;
|
|
|
|
|
}
|
2010-05-20 20:38:06 +08:00
|
|
|
|
2010-07-20 03:10:29 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a simple base to a base index
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param base [AaCcGgTt]
|
2010-07-20 03:10:29 +08:00
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
2012-10-13 03:05:03 +08:00
|
|
|
static public int simpleBaseToBaseIndex(final byte base) {
|
|
|
|
|
if ( base < 0 || base >= 256 )
|
2013-01-31 01:54:57 +08:00
|
|
|
throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)");
|
2012-06-28 04:55:49 +08:00
|
|
|
return baseIndexMap[base];
|
2010-07-20 03:10:29 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a simple base to a base index
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param base [AaCcGgTt]
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-04-13 08:46:23 +08:00
|
|
|
static public int simpleBaseToBaseIndex(char base) {
|
2012-06-28 04:55:49 +08:00
|
|
|
return baseIndexMap[base];
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|
|
|
|
|
|
2010-05-21 05:02:44 +08:00
|
|
|
static public int extendedBaseToBaseIndex(byte base) {
|
2010-03-04 23:00:02 +08:00
|
|
|
switch (base) {
|
|
|
|
|
case 'd':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'D':
|
2013-01-16 23:22:43 +08:00
|
|
|
return Base.D.ordinal();
|
2010-03-04 23:00:02 +08:00
|
|
|
case 'n':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'N':
|
2013-01-16 23:22:43 +08:00
|
|
|
return Base.N.ordinal();
|
2010-03-04 23:00:02 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
default:
|
|
|
|
|
return simpleBaseToBaseIndex(base);
|
2010-03-04 23:00:02 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isRegularBase( final char base ) {
|
2009-09-10 05:19:36 +08:00
|
|
|
return simpleBaseToBaseIndex(base) != -1;
|
|
|
|
|
}
|
|
|
|
|
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isRegularBase( final byte base ) {
|
2010-05-21 05:02:44 +08:00
|
|
|
return simpleBaseToBaseIndex(base) != -1;
|
2009-12-02 23:41:35 +08:00
|
|
|
}
|
|
|
|
|
|
2012-08-21 01:41:08 +08:00
|
|
|
static public boolean isAllRegularBases( final byte[] bases ) {
|
|
|
|
|
for( final byte base : bases) {
|
|
|
|
|
if( !isRegularBase(base) ) { return false; }
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2010-01-06 23:01:51 +08:00
|
|
|
static public boolean isNBase(byte base) {
|
2011-05-20 03:50:39 +08:00
|
|
|
return base == 'N' || base == 'n';
|
2010-01-06 23:01:51 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-14 22:49:12 +08:00
|
|
|
/**
|
|
|
|
|
* Converts a base index to a simple base
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseIndex 0, 1, 2, 3
|
2009-04-14 22:49:12 +08:00
|
|
|
* @return A, C, G, T, or '.' if the index can't be understood
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte baseIndexToSimpleBase(int baseIndex) {
|
2009-04-13 08:46:23 +08:00
|
|
|
switch (baseIndex) {
|
2012-02-08 07:11:53 +08:00
|
|
|
case 0:
|
|
|
|
|
return 'A';
|
|
|
|
|
case 1:
|
|
|
|
|
return 'C';
|
|
|
|
|
case 2:
|
|
|
|
|
return 'G';
|
|
|
|
|
case 3:
|
|
|
|
|
return 'T';
|
|
|
|
|
default:
|
|
|
|
|
return '.';
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-04-14 22:49:12 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
/**
|
2009-12-07 22:26:27 +08:00
|
|
|
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
|
|
|
|
*
|
|
|
|
|
* @param base the base [AaCcGgTt]
|
2009-05-23 03:32:20 +08:00
|
|
|
* @return the complementary base, or the input base if it's not one of the understood ones
|
2009-05-15 02:57:48 +08:00
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
static public byte simpleComplement(byte base) {
|
2009-04-22 06:25:33 +08:00
|
|
|
switch (base) {
|
|
|
|
|
case 'A':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'a':
|
|
|
|
|
return 'T';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'C':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'c':
|
|
|
|
|
return 'G';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'G':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 'g':
|
|
|
|
|
return 'C';
|
2009-04-22 06:25:33 +08:00
|
|
|
case 'T':
|
2012-02-08 07:11:53 +08:00
|
|
|
case 't':
|
|
|
|
|
return 'A';
|
|
|
|
|
default:
|
|
|
|
|
return base;
|
2009-04-22 06:25:33 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
Work on GSA-508 / CachingIndexedFastaReader should internally upper case bases loading data
-- As one might expect, CachingIndexedFastaSequenceFile now internally upper cases the FASTA reference bases. This is now done by default, unless requested explicitly to preserve the original bases.
-- This is really the correct place to do this for a variety of reasons. First, you don't need to work about upper casing bases throughout the code. Second, the cache is only upper cased once, no matter how often the bases are accessed, which walkers cannot optimize themselves. Finally, this uses the fastest function for this -- Picard's toUpperCase(byte[]) which is way better than String.toUpperCase()
-- Added unit tests to ensure this functionality works correct.
-- Removing unnecessary upper casing of bases in some core GATK tools, now that RefContext guarentees that the reference bases are all upper case.
-- Added contracts to ensure this is the case.
-- Remove a ton of sh*t from BaseUtils that was so old I had no idea what it was doing any longer, and didn't have any unit tests to ensure it was correct, and wasn't used anywhere in our code
2012-10-30 07:06:05 +08:00
|
|
|
static private char simpleComplement(char base) {
|
2012-02-08 07:11:53 +08:00
|
|
|
return (char) simpleComplement((byte) base);
|
2010-05-20 22:05:13 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-15 02:57:48 +08:00
|
|
|
/**
|
2009-05-22 03:39:39 +08:00
|
|
|
* Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
|
|
|
|
|
*
|
2009-12-07 22:26:27 +08:00
|
|
|
* @param bases the byte array of bases
|
2009-05-15 02:57:48 +08:00
|
|
|
* @return the reverse complement of the base byte array
|
|
|
|
|
*/
|
2009-04-22 06:25:33 +08:00
|
|
|
static public byte[] simpleReverseComplement(byte[] bases) {
|
|
|
|
|
byte[] rcbases = new byte[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
2010-07-20 03:10:29 +08:00
|
|
|
rcbases[i] = simpleComplement(bases[bases.length - 1 - i]);
|
2009-04-22 06:25:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
2009-05-22 02:30:45 +08:00
|
|
|
|
2009-12-18 04:07:26 +08:00
|
|
|
/**
|
|
|
|
|
* Reverse complement a char array of bases
|
|
|
|
|
*
|
|
|
|
|
* @param bases the char array of bases
|
|
|
|
|
* @return the reverse complement of the char byte array
|
|
|
|
|
*/
|
2010-05-20 22:05:13 +08:00
|
|
|
@Deprecated
|
2009-12-18 04:07:26 +08:00
|
|
|
static public char[] simpleReverseComplement(char[] bases) {
|
|
|
|
|
char[] rcbases = new char[bases.length];
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < bases.length; i++) {
|
|
|
|
|
rcbases[i] = simpleComplement(bases[bases.length - 1 - i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rcbases;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:39:39 +08:00
|
|
|
/**
|
|
|
|
|
* Reverse complement a String of bases. Preserves ambiguous bases.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param bases the String of bases
|
2009-05-22 03:39:39 +08:00
|
|
|
* @return the reverse complement of the String
|
|
|
|
|
*/
|
2010-07-20 03:10:29 +08:00
|
|
|
@Deprecated
|
2009-05-22 03:39:39 +08:00
|
|
|
static public String simpleReverseComplement(String bases) {
|
2009-05-23 03:32:20 +08:00
|
|
|
return new String(simpleReverseComplement(bases.getBytes()));
|
2009-05-22 03:39:39 +08:00
|
|
|
}
|
|
|
|
|
|
2012-07-26 13:50:39 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the uppercased version of the bases
|
|
|
|
|
*
|
|
|
|
|
* @param bases the bases
|
|
|
|
|
* @return the upper cased version
|
|
|
|
|
*/
|
2012-08-22 23:26:08 +08:00
|
|
|
static public void convertToUpperCase(final byte[] bases) {
|
|
|
|
|
StringUtil.toUpperCase(bases);
|
2012-07-26 13:50:39 +08:00
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the index of the most common base in the basecounts array. To be used with
|
|
|
|
|
* pileup.getBaseCounts.
|
|
|
|
|
*
|
|
|
|
|
* @param baseCounts counts of a,c,g,t in order.
|
|
|
|
|
* @return the index of the most common base
|
|
|
|
|
*/
|
|
|
|
|
static public int mostFrequentBaseIndex(int[] baseCounts) {
|
|
|
|
|
int mostFrequentBaseIndex = 0;
|
|
|
|
|
for (int baseIndex = 1; baseIndex < 4; baseIndex++) {
|
|
|
|
|
if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) {
|
|
|
|
|
mostFrequentBaseIndex = baseIndex;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return mostFrequentBaseIndex;
|
|
|
|
|
}
|
|
|
|
|
|
2011-08-05 05:49:08 +08:00
|
|
|
static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) {
|
|
|
|
|
int tmp = baseCounts[refBaseIndex];
|
|
|
|
|
baseCounts[refBaseIndex] = -1;
|
|
|
|
|
int result = mostFrequentBaseIndex(baseCounts);
|
|
|
|
|
baseCounts[refBaseIndex] = tmp;
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) {
|
|
|
|
|
return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase));
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param baseCounts counts of a,c,g,t in order.
|
2011-07-29 06:58:36 +08:00
|
|
|
* @return the most common base
|
|
|
|
|
*/
|
|
|
|
|
static public byte mostFrequentSimpleBase(int[] baseCounts) {
|
|
|
|
|
return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts));
|
|
|
|
|
}
|
2009-08-13 04:16:22 +08:00
|
|
|
|
2009-05-22 04:35:31 +08:00
|
|
|
/**
|
|
|
|
|
* For the most frequent base in the sequence, return the percentage of the read it constitutes.
|
|
|
|
|
*
|
2012-02-08 07:11:53 +08:00
|
|
|
* @param sequence the read sequence
|
|
|
|
|
* @return the percentage of the read that's made up of the most frequent base
|
2009-05-22 04:35:31 +08:00
|
|
|
*/
|
|
|
|
|
static public double mostFrequentBaseFraction(byte[] sequence) {
|
|
|
|
|
int[] baseCounts = new int[4];
|
|
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
for (byte base : sequence) {
|
2010-07-20 03:10:29 +08:00
|
|
|
int baseIndex = simpleBaseToBaseIndex(base);
|
2009-05-22 04:35:31 +08:00
|
|
|
|
|
|
|
|
if (baseIndex >= 0) {
|
|
|
|
|
baseCounts[baseIndex]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-29 06:58:36 +08:00
|
|
|
int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
|
2009-05-22 04:35:31 +08:00
|
|
|
|
2012-02-08 07:11:53 +08:00
|
|
|
return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length);
|
2009-05-22 04:35:31 +08:00
|
|
|
}
|
2009-06-09 08:47:54 +08:00
|
|
|
|
2010-05-20 22:05:13 +08:00
|
|
|
// --------------------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// random bases
|
|
|
|
|
//
|
|
|
|
|
// --------------------------------------------------------------------------------
|
|
|
|
|
|
2009-06-09 08:47:54 +08:00
|
|
|
/**
|
|
|
|
|
* Return a random base index (A=0, C=1, G=2, T=3).
|
|
|
|
|
*
|
|
|
|
|
* @return a random base index (A=0, C=1, G=2, T=3)
|
|
|
|
|
*/
|
|
|
|
|
static public int getRandomBaseIndex() {
|
|
|
|
|
return getRandomBaseIndex(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return a random base index, excluding some base index.
|
|
|
|
|
*
|
|
|
|
|
* @param excludeBaseIndex the base index to exclude
|
|
|
|
|
* @return a random base index, excluding the one specified (A=0, C=1, G=2, T=3)
|
|
|
|
|
*/
|
|
|
|
|
static public int getRandomBaseIndex(int excludeBaseIndex) {
|
|
|
|
|
int randomBaseIndex = excludeBaseIndex;
|
|
|
|
|
|
|
|
|
|
while (randomBaseIndex == excludeBaseIndex) {
|
2013-01-31 01:54:57 +08:00
|
|
|
randomBaseIndex = GenomeAnalysisEngine.getRandomGenerator().nextInt(4);
|
2009-06-09 08:47:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return randomBaseIndex;
|
|
|
|
|
}
|
2013-01-27 15:36:31 +08:00
|
|
|
|
|
|
|
|
public static byte getComplement(byte base) {
|
|
|
|
|
switch(base) {
|
|
|
|
|
case 'a':
|
|
|
|
|
case 'A':
|
|
|
|
|
return 'T';
|
|
|
|
|
case 'c':
|
|
|
|
|
case 'C':
|
|
|
|
|
return 'G';
|
|
|
|
|
case 'g':
|
|
|
|
|
case 'G':
|
|
|
|
|
return 'C';
|
|
|
|
|
case 't':
|
|
|
|
|
case 'T':
|
|
|
|
|
return 'A';
|
|
|
|
|
case 'n':
|
|
|
|
|
case 'N':
|
|
|
|
|
return 'N';
|
|
|
|
|
default:
|
2013-01-31 01:54:57 +08:00
|
|
|
throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base.");
|
2013-01-27 15:36:31 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-04-13 08:46:23 +08:00
|
|
|
}
|