laying groundwork to have insertions and deletions going through the system.

2012-02-07 18:11:53 -05:00 · 2012-02-07 18:11:53 -05:00 · e89887cd8e
parent 0d3ea0401c
commit e89887cd8e
2 changed files with 163 additions and 136 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
 import org.broadinstitute.sting.utils.sam.AlignmentUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 import java.util.ArrayList;
 import java.util.List;
@ -284,7 +285,7 @@ public class RecalDataManager {
    public static void parseColorSpace(final GATKSAMRecord read) {
        // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
-        if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) {
+        if (ReadUtils.isSOLiDRead(read)) {
            if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
                final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
                if (attr != null) {
@ -382,7 +383,7 @@ public class RecalDataManager {
    }
    public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
-        if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) {
+        if (ReadUtils.isSOLiDRead(read)) {
            final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
            if (attr != null) {
                byte[] colorSpace;
@ -611,21 +612,17 @@ public class RecalDataManager {
        final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
        final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
-        // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
+        for (int i = 0; i < numRequestedCovariates; i++) {                              // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
        for (int i = 0; i < numRequestedCovariates; i++) {
            requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
-            for (int j = 0; j < readLength; j++) {
+            for (int j = 0; j < readLength; j++)
-                //copy values into a 2D array that allows all covar types to be extracted at once for
+                covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];    // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
                //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
                covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
            }
        }
        return covariateValues_offset_x_covar;
    }
    /**
-     * Perform a ceratin transversion (A <-> C or G <-> T) on the base.
+     * Perform a certain transversion (A <-> C or G <-> T) on the base.
     *
     * @param base the base [AaCcGgTt]
     * @return the transversion of the base, or the input base if it's not one of the understood ones
--- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@ -2,57 +2,59 @@ package org.broadinstitute.sting.utils;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 /**
 * BaseUtils contains some basic utilities for manipulating nucleotides.
 */
 public class BaseUtils {
-    public final static byte A = (byte)'A';
+    public final static byte A = (byte) 'A';
-    public final static byte C = (byte)'C';
+    public final static byte C = (byte) 'C';
-    public final static byte G = (byte)'G';
+    public final static byte G = (byte) 'G';
-    public final static byte T = (byte)'T';
+    public final static byte T = (byte) 'T';
-    public final static byte N = (byte)'N';
+    public final static byte N = (byte) 'N';
-    public final static byte D = (byte)'D';
+    public final static byte D = (byte) 'D';
    //
    // todo -- we need a generalized base abstraction using the Base enum.
    //
-    public final static byte[] BASES = { 'A', 'C', 'G', 'T' };
+    public final static byte[] BASES = {'A', 'C', 'G', 'T'};
-    public final static byte[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
+    public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
    public enum Base {
-        A ( 'A', 0 ),
+        A('A', 0),
-        C ( 'C', 1 ),
+        C('C', 1),
-        G ( 'G', 2 ),
+        G('G', 2),
-        T ( 'T', 3 );
+        T('T', 3);
        byte b;
        int index;
        private Base(char base, int index) {
-            this.b = (byte)base;
+            this.b = (byte) base;
            this.index = index;
        }
        public byte getBase() { return b; }
-        public char getBaseAsChar() { return (char)b; }
+
        public char getBaseAsChar() { return (char) b; }
        public int getIndex() { return index; }
        public boolean sameBase(byte o) { return b == o; }
        public boolean sameBase(char o) { return b == (byte)o; }
        public boolean sameBase(int i)  { return index == i; }
    }
        public boolean sameBase(char o) { return b == (byte) o; }
        public boolean sameBase(int i) { return index == i; }
    }
    // todo -- fix me (enums?)
    public static final byte DELETION_INDEX = 4;
    public static final byte NO_CALL_INDEX = 5; // (this is 'N')
-    public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte)'G');
+    public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G');
-    public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte)'C');
+    public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C');
-    public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte)'A');
+    public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A');
-    public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte)'T');
+    public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T');
    /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
    // a pyrimidine to another pyrimidine nucleotide (C <-> T).
@ -64,28 +66,31 @@ public class BaseUtils {
    /**
     * Returns the base substitution type of the 2 state SNP
     *
     * @param base1
     * @param base2
     * @return
     */
-    public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) {
+    public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) {
        BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
        //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
        return t;
    }
-    public static boolean isTransition( byte base1, byte  base2 ) {
+    public static boolean isTransition(byte base1, byte base2) {
        int b1 = simpleBaseToBaseIndex(base1);
        int b2 = simpleBaseToBaseIndex(base2);
        return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 ||
-               b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
+                b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
    }
-    public static boolean isTransversion( byte  base1, byte  base2 ) {
+    public static boolean isTransversion(byte base1, byte base2) {
-        return ! isTransition(base1, base2);
+        return !isTransition(base1, base2);
    }
-    /** Private constructor.  No instantiating this class! */
+    /**
     * Private constructor.  No instantiating this class!
     */
    private BaseUtils() {}
    static public boolean basesAreEqual(byte base1, byte base2) {
@ -96,7 +101,6 @@ public class BaseUtils {
        return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
    }
    /**
     * Converts a IUPAC nucleotide code to a pair of bases
     *
@ -163,33 +167,37 @@ public class BaseUtils {
    /**
     * Converts a simple base to a base index
     *
-     * @param base  [AaCcGgTt]
+     * @param base [AaCcGgTt]
     * @return 0, 1, 2, 3, or -1 if the base can't be understood
     */
    static public int simpleBaseToBaseIndex(byte base) {
        switch (base) {
            case '*':               // the wildcard character counts as an A
            case 'A':
-            case 'a': return 0;
+            case 'a':
                return 0;
            case 'C':
-            case 'c': return 1;
+            case 'c':
                return 1;
            case 'G':
-            case 'g': return 2;
+            case 'g':
                return 2;
            case 'T':
-            case 't': return 3;
+            case 't':
                return 3;
-            default: return -1;
+            default:
                return -1;
        }
    }
    /**
     * Converts a simple base to a base index
     *
-     * @param base  [AaCcGgTt]
+     * @param base [AaCcGgTt]
     * @return 0, 1, 2, 3, or -1 if the base can't be understood
     */
    @Deprecated
@ -197,29 +205,37 @@ public class BaseUtils {
        switch (base) {
            case '*':               // the wildcard character counts as an A
            case 'A':
-            case 'a': return 0;
+            case 'a':
                return 0;
            case 'C':
-            case 'c': return 1;
+            case 'c':
                return 1;
            case 'G':
-            case 'g': return 2;
+            case 'g':
                return 2;
            case 'T':
-            case 't': return 3;
+            case 't':
                return 3;
-            default: return -1;
+            default:
                return -1;
        }
    }
    static public int extendedBaseToBaseIndex(byte base) {
        switch (base) {
            case 'd':
-            case 'D': return DELETION_INDEX;
+            case 'D':
                return DELETION_INDEX;
            case 'n':
-            case 'N': return NO_CALL_INDEX;
+            case 'N':
                return NO_CALL_INDEX;
-            default: return simpleBaseToBaseIndex(base);
+            default:
                return simpleBaseToBaseIndex(base);
        }
    }
@ -232,11 +248,6 @@ public class BaseUtils {
        return simpleBaseToBaseIndex(base) != -1;
    }
    @Deprecated
    static public boolean isNBase(char base) {
        return isNBase((byte)base);
    }
    static public boolean isNBase(byte base) {
        return base == 'N' || base == 'n';
    }
@ -244,68 +255,83 @@ public class BaseUtils {
    /**
     * Converts a base index to a simple base
     *
-     * @param baseIndex  0, 1, 2, 3
+     * @param baseIndex 0, 1, 2, 3
     * @return A, C, G, T, or '.' if the index can't be understood
     */
    static public byte baseIndexToSimpleBase(int baseIndex) {
        switch (baseIndex) {
-            case 0: return 'A';
+            case 0:
-            case 1: return 'C';
+                return 'A';
-            case 2: return 'G';
+            case 1:
-            case 3: return 'T';
+                return 'C';
-            default: return '.';
+            case 2:
                return 'G';
            case 3:
                return 'T';
            default:
                return '.';
        }
    }
    @Deprecated
    static public char baseIndexToSimpleBaseAsChar(int baseIndex) {
-        return (char)baseIndexToSimpleBase(baseIndex);
+        return (char) baseIndexToSimpleBase(baseIndex);
    }
    /**
     * Converts a base index to a base index representing its cross-talk partner
     *
-     * @param baseIndex  0, 1, 2, 3
+     * @param baseIndex 0, 1, 2, 3
     * @return 1, 0, 3, 2, or -1 if the index can't be understood
     */
    static public int crossTalkPartnerIndex(int baseIndex) {
        switch (baseIndex) {
-            case 0: return 1; // A -> C
+            case 0:
-            case 1: return 0; // C -> A
+                return 1; // A -> C
-            case 2: return 3; // G -> T
+            case 1:
-            case 3: return 2; // T -> G
+                return 0; // C -> A
-            default: return -1;
+            case 2:
                return 3; // G -> T
            case 3:
                return 2; // T -> G
            default:
                return -1;
        }
    }
    /**
     * Converts a base to the base representing its cross-talk partner
     *
-     * @param base  [AaCcGgTt]
+     * @param base [AaCcGgTt]
     * @return C, A, T, G, or '.' if the base can't be understood
     */
    @Deprecated
    static public char crossTalkPartnerBase(char base) {
-        return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
+        return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
    }
    /**
     * Return the complement of a base index.
     *
-     * @param baseIndex  the base index (0:A, 1:C, 2:G, 3:T)
+     * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
     * @return the complementary base index
     */
    static public byte complementIndex(int baseIndex) {
        switch (baseIndex) {
-            case 0: return 3; // a -> t
+            case 0:
-            case 1: return 2; // c -> g
+                return 3; // a -> t
-            case 2: return 1; // g -> c
+            case 1:
-            case 3: return 0; // t -> a
+                return 2; // c -> g
-            default: return -1; // wtf?
+            case 2:
                return 1; // g -> c
            case 3:
                return 0; // t -> a
            default:
                return -1; // wtf?
        }
    }
-   /**
+    /**
     * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
     *
     * @param base the base [AaCcGgTt]
@ -314,20 +340,25 @@ public class BaseUtils {
    static public byte simpleComplement(byte base) {
        switch (base) {
            case 'A':
-            case 'a': return 'T';
+            case 'a':
                return 'T';
            case 'C':
-            case 'c': return 'G';
+            case 'c':
                return 'G';
            case 'G':
-            case 'g': return 'C';
+            case 'g':
                return 'C';
            case 'T':
-            case 't': return 'A';
+            case 't':
-            default: return base;
+                return 'A';
            default:
                return base;
        }
    }
    @Deprecated
    static public char simpleComplement(char base) {
-        return (char)simpleComplement((byte)base);
+        return (char) simpleComplement((byte) base);
    }
    /**
@ -349,7 +380,7 @@ public class BaseUtils {
    /**
     * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
     *
-     * @param bases  the byte array of bases
+     * @param bases the byte array of bases
     * @return the complement of the base byte array
     */
    static public byte[] simpleComplement(byte[] bases) {
@ -382,7 +413,7 @@ public class BaseUtils {
    /**
     * Complement a char array of bases
     *
-     * @param bases  the char array of bases
+     * @param bases the char array of bases
     * @return the complement of the base char array
     */
    @Deprecated
@ -399,7 +430,7 @@ public class BaseUtils {
    /**
     * Reverse complement a String of bases.  Preserves ambiguous bases.
     *
-     * @param bases  the String of bases
+     * @param bases the String of bases
     * @return the reverse complement of the String
     */
    @Deprecated
@ -407,11 +438,10 @@ public class BaseUtils {
        return new String(simpleReverseComplement(bases.getBytes()));
    }
    /**
     * Complement a String of bases.  Preserves ambiguous bases.
     *
-     * @param bases  the String of bases
+     * @param bases the String of bases
     * @return the complement of the String
     */
    @Deprecated
@ -451,7 +481,7 @@ public class BaseUtils {
    /**
     * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
     *
-     * @param  baseCounts counts of a,c,g,t in order.
+     * @param baseCounts counts of a,c,g,t in order.
     * @return the most common base
     */
    static public byte mostFrequentSimpleBase(int[] baseCounts) {
@ -461,13 +491,13 @@ public class BaseUtils {
    /**
     * For the most frequent base in the sequence, return the percentage of the read it constitutes.
     *
-     * @param sequence  the read sequence
+     * @param sequence the read sequence
-     * @return  the percentage of the read that's made up of the most frequent base
+     * @return the percentage of the read that's made up of the most frequent base
     */
    static public double mostFrequentBaseFraction(byte[] sequence) {
        int[] baseCounts = new int[4];
-        for ( byte base : sequence ) {
+        for (byte base : sequence) {
            int baseIndex = simpleBaseToBaseIndex(base);
            if (baseIndex >= 0) {
@ -477,7 +507,7 @@ public class BaseUtils {
        int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
-        return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length);
+        return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length);
    }
    // --------------------------------------------------------------------------------
@ -531,50 +561,50 @@ public class BaseUtils {
    static public byte getRandomBase(char excludeBase) {
        return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
    }
-    
+
-    
+    /**
-    /** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, 
+     * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
     * that for all  i = 0... seq.length-1,  seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
-     *  The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period 
+     * The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
-     *  of 2 (it has a period of 4 as well), and so does
+     * of 2 (it has a period of 4 as well), and so does
-     * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is 
+     * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
     * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
     * or if specified minPeriod is greater than the sequence length.
-     *   
+     *
     * @param seq
     * @return
     */
    public static int sequencePeriod(byte[] seq, int minPeriod) {
-    	int period = ( minPeriod > seq.length ? seq.length : minPeriod );
+        int period = (minPeriod > seq.length ? seq.length : minPeriod);
-    	// we assume that bases [0,period-1] repeat themselves and check this assumption
+        // we assume that bases [0,period-1] repeat themselves and check this assumption
-    	// until we find correct period
+        // until we find correct period
-    	
+
-    	for ( int pos = period ; pos < seq.length ; pos++ ) {
+        for (int pos = period; pos < seq.length; pos++) {
-    		
+
-    		int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
+            int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period'
-    		                                                // if our current hypothesis holds, base[pos] must be the same as base[offset]
+            // if our current hypothesis holds, base[pos] must be the same as base[offset]
-    		
+
-    		if ( Character.toUpperCase( seq[pos] ) !=
+            if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
-    				Character.toUpperCase( seq[offset] )
+
-    			) {
+                // period we have been trying so far does not work.
-    			
+                // two possibilities:
-    			// period we have been trying so far does not work.
+                // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
-    			// two possibilities:
+                //      in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
-    			// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
+                //       so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
-    			//      in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
+                //      pos will be autoincremented and we will be checking next base
-    		   //       so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance 
+                // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
-    			//      pos will be autoincremented and we will be checking next base
+                //     hence we should first check if it matches the first base of the sequence, and to do that
-    			// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
+                //     we set period to pos  (thus trying the hypothesis that bases from start up to the current one,
-    			//     hence we should first check if it matches the first base of the sequence, and to do that
+                //     non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
-    			//     we set period to pos  (thus trying the hypothesis that bases from start up to the current one,
+                // on the next loop re-entrance after pos is autoincremented)
-    			//     non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
+                if (offset == 0)
-    			// on the next loop re-entrance after pos is autoincremented)
+                    period = pos + 1;
-    			if ( offset == 0 ) period = pos+1;
+                else
-    			else period = pos-- ;
+                    period = pos--;
-    		
+
-    		} 
+            }
-    	}
+        }
-    	return period;
+        return period;
    }
 }