diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 72c2b2829..311e33f8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; import java.util.List; @@ -284,7 +285,7 @@ public class RecalDataManager { public static void parseColorSpace(final GATKSAMRecord read) { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base - if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { + if (ReadUtils.isSOLiDRead(read)) { if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { @@ -382,7 +383,7 @@ public class RecalDataManager { } public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { - if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { + if (ReadUtils.isSOLiDRead(read)) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; @@ -611,21 +612,17 @@ public class RecalDataManager { final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for (int i = 0; i < numRequestedCovariates; i++) { + for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType); - for (int j = 0; j < readLength; j++) { - //copy values into a 2D array that allows all covar types to be extracted at once for - //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. - covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; - } + for (int j = 0; j < readLength; j++) + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. } return covariateValues_offset_x_covar; } /** - * Perform a ceratin transversion (A <-> C or G <-> T) on the base. + * Perform a certain transversion (A <-> C or G <-> T) on the base. * * @param base the base [AaCcGgTt] * @return the transversion of the base, or the input base if it's not one of the understood ones diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 673b1524d..61812629c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -2,57 +2,59 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; - /** * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte)'A'; - public final static byte C = (byte)'C'; - public final static byte G = (byte)'G'; - public final static byte T = (byte)'T'; + public final static byte A = (byte) 'A'; + public final static byte C = (byte) 'C'; + public final static byte G = (byte) 'G'; + public final static byte T = (byte) 'T'; - public final static byte N = (byte)'N'; - public final static byte D = (byte)'D'; + public final static byte N = (byte) 'N'; + public final static byte D = (byte) 'D'; // // todo -- we need a generalized base abstraction using the Base enum. // - public final static byte[] BASES = { 'A', 'C', 'G', 'T' }; - public final static byte[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' }; + public final static byte[] BASES = {'A', 'C', 'G', 'T'}; + public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; public enum Base { - A ( 'A', 0 ), - C ( 'C', 1 ), - G ( 'G', 2 ), - T ( 'T', 3 ); + A('A', 0), + C('C', 1), + G('G', 2), + T('T', 3); byte b; int index; + private Base(char base, int index) { - this.b = (byte)base; + this.b = (byte) base; this.index = index; } public byte getBase() { return b; } - public char getBaseAsChar() { return (char)b; } + + public char getBaseAsChar() { return (char) b; } + public int getIndex() { return index; } public boolean sameBase(byte o) { return b == o; } - public boolean sameBase(char o) { return b == (byte)o; } - public boolean sameBase(int i) { return index == i; } - } + public boolean sameBase(char o) { return b == (byte) o; } + + public boolean sameBase(int i) { return index == i; } + } // todo -- fix me (enums?) public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') - public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte)'G'); - public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte)'C'); - public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte)'A'); - public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte)'T'); - + public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); + public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); + public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); + public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). @@ -64,28 +66,31 @@ public class BaseUtils { /** * Returns the base substitution type of the 2 state SNP + * * @param base1 * @param base2 * @return */ - public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) { + public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) { BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION; //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t); return t; } - public static boolean isTransition( byte base1, byte base2 ) { + public static boolean isTransition(byte base1, byte base2) { int b1 = simpleBaseToBaseIndex(base1); int b2 = simpleBaseToBaseIndex(base2); return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; } - public static boolean isTransversion( byte base1, byte base2 ) { - return ! isTransition(base1, base2); + public static boolean isTransversion(byte base1, byte base2) { + return !isTransition(base1, base2); } - /** Private constructor. No instantiating this class! */ + /** + * Private constructor. No instantiating this class! + */ private BaseUtils() {} static public boolean basesAreEqual(byte base1, byte base2) { @@ -96,7 +101,6 @@ public class BaseUtils { return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); } - /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -163,33 +167,37 @@ public class BaseUtils { /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ static public int simpleBaseToBaseIndex(byte base) { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } - /** * Converts a simple base to a base index * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ @Deprecated @@ -197,29 +205,37 @@ public class BaseUtils { switch (base) { case '*': // the wildcard character counts as an A case 'A': - case 'a': return 0; + case 'a': + return 0; case 'C': - case 'c': return 1; + case 'c': + return 1; case 'G': - case 'g': return 2; + case 'g': + return 2; case 'T': - case 't': return 3; + case 't': + return 3; - default: return -1; + default: + return -1; } } static public int extendedBaseToBaseIndex(byte base) { switch (base) { case 'd': - case 'D': return DELETION_INDEX; + case 'D': + return DELETION_INDEX; case 'n': - case 'N': return NO_CALL_INDEX; + case 'N': + return NO_CALL_INDEX; - default: return simpleBaseToBaseIndex(base); + default: + return simpleBaseToBaseIndex(base); } } @@ -232,11 +248,6 @@ public class BaseUtils { return simpleBaseToBaseIndex(base) != -1; } - @Deprecated - static public boolean isNBase(char base) { - return isNBase((byte)base); - } - static public boolean isNBase(byte base) { return base == 'N' || base == 'n'; } @@ -244,68 +255,83 @@ public class BaseUtils { /** * Converts a base index to a simple base * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return A, C, G, T, or '.' if the index can't be understood */ static public byte baseIndexToSimpleBase(int baseIndex) { switch (baseIndex) { - case 0: return 'A'; - case 1: return 'C'; - case 2: return 'G'; - case 3: return 'T'; - default: return '.'; + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return '.'; } } @Deprecated static public char baseIndexToSimpleBaseAsChar(int baseIndex) { - return (char)baseIndexToSimpleBase(baseIndex); + return (char) baseIndexToSimpleBase(baseIndex); } /** * Converts a base index to a base index representing its cross-talk partner * - * @param baseIndex 0, 1, 2, 3 + * @param baseIndex 0, 1, 2, 3 * @return 1, 0, 3, 2, or -1 if the index can't be understood */ static public int crossTalkPartnerIndex(int baseIndex) { switch (baseIndex) { - case 0: return 1; // A -> C - case 1: return 0; // C -> A - case 2: return 3; // G -> T - case 3: return 2; // T -> G - default: return -1; + case 0: + return 1; // A -> C + case 1: + return 0; // C -> A + case 2: + return 3; // G -> T + case 3: + return 2; // T -> G + default: + return -1; } } /** * Converts a base to the base representing its cross-talk partner * - * @param base [AaCcGgTt] + * @param base [AaCcGgTt] * @return C, A, T, G, or '.' if the base can't be understood */ @Deprecated static public char crossTalkPartnerBase(char base) { - return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); + return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); } /** * Return the complement of a base index. * - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) + * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) * @return the complementary base index */ static public byte complementIndex(int baseIndex) { switch (baseIndex) { - case 0: return 3; // a -> t - case 1: return 2; // c -> g - case 2: return 1; // g -> c - case 3: return 0; // t -> a - default: return -1; // wtf? + case 0: + return 3; // a -> t + case 1: + return 2; // c -> g + case 2: + return 1; // g -> c + case 3: + return 0; // t -> a + default: + return -1; // wtf? } } - /** + /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). * * @param base the base [AaCcGgTt] @@ -314,20 +340,25 @@ public class BaseUtils { static public byte simpleComplement(byte base) { switch (base) { case 'A': - case 'a': return 'T'; + case 'a': + return 'T'; case 'C': - case 'c': return 'G'; + case 'c': + return 'G'; case 'G': - case 'g': return 'C'; + case 'g': + return 'C'; case 'T': - case 't': return 'A'; - default: return base; + case 't': + return 'A'; + default: + return base; } } @Deprecated static public char simpleComplement(char base) { - return (char)simpleComplement((byte)base); + return (char) simpleComplement((byte) base); } /** @@ -349,7 +380,7 @@ public class BaseUtils { /** * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) * - * @param bases the byte array of bases + * @param bases the byte array of bases * @return the complement of the base byte array */ static public byte[] simpleComplement(byte[] bases) { @@ -382,7 +413,7 @@ public class BaseUtils { /** * Complement a char array of bases * - * @param bases the char array of bases + * @param bases the char array of bases * @return the complement of the base char array */ @Deprecated @@ -399,7 +430,7 @@ public class BaseUtils { /** * Reverse complement a String of bases. Preserves ambiguous bases. * - * @param bases the String of bases + * @param bases the String of bases * @return the reverse complement of the String */ @Deprecated @@ -407,11 +438,10 @@ public class BaseUtils { return new String(simpleReverseComplement(bases.getBytes())); } - /** * Complement a String of bases. Preserves ambiguous bases. * - * @param bases the String of bases + * @param bases the String of bases * @return the complement of the String */ @Deprecated @@ -451,7 +481,7 @@ public class BaseUtils { /** * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. * - * @param baseCounts counts of a,c,g,t in order. + * @param baseCounts counts of a,c,g,t in order. * @return the most common base */ static public byte mostFrequentSimpleBase(int[] baseCounts) { @@ -461,13 +491,13 @@ public class BaseUtils { /** * For the most frequent base in the sequence, return the percentage of the read it constitutes. * - * @param sequence the read sequence - * @return the percentage of the read that's made up of the most frequent base + * @param sequence the read sequence + * @return the percentage of the read that's made up of the most frequent base */ static public double mostFrequentBaseFraction(byte[] sequence) { int[] baseCounts = new int[4]; - for ( byte base : sequence ) { + for (byte base : sequence) { int baseIndex = simpleBaseToBaseIndex(base); if (baseIndex >= 0) { @@ -477,7 +507,7 @@ public class BaseUtils { int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); - return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length); + return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length); } // -------------------------------------------------------------------------------- @@ -531,50 +561,50 @@ public class BaseUtils { static public byte getRandomBase(char excludeBase) { return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); } - - - /** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, + + /** + * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is + * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period + * of 2 (it has a period of 4 as well), and so does + * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range * or if specified minPeriod is greater than the sequence length. - * + * * @param seq * @return */ public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = ( minPeriod > seq.length ? seq.length : minPeriod ); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for ( int pos = period ; pos < seq.length ; pos++ ) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if ( Character.toUpperCase( seq[pos] ) != - Character.toUpperCase( seq[offset] ) - ) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? - // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if ( offset == 0 ) period = pos+1; - else period = pos-- ; - - } - } - return period; + int period = (minPeriod > seq.length ? seq.length : minPeriod); + // we assume that bases [0,period-1] repeat themselves and check this assumption + // until we find correct period + + for (int pos = period; pos < seq.length; pos++) { + + int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' + // if our current hypothesis holds, base[pos] must be the same as base[offset] + + if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { + + // period we have been trying so far does not work. + // two possibilities: + // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; + // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; + // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance + // pos will be autoincremented and we will be checking next base + // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? + // hence we should first check if it matches the first base of the sequence, and to do that + // we set period to pos (thus trying the hypothesis that bases from start up to the current one, + // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base + // on the next loop re-entrance after pos is autoincremented) + if (offset == 0) + period = pos + 1; + else + period = pos--; + + } + } + return period; } }