Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-01-03 09:38:49 -05:00
commit ab8d47d9a5
13 changed files with 1078 additions and 612 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -49,8 +50,11 @@ public class MathUtils {
*/
/** Private constructor. No instantiating this class! */
private MathUtils() {}
/**
* Private constructor. No instantiating this class!
*/
private MathUtils() {
}
@Requires({"d > 0.0"})
public static int fastPositiveRound(double d) {
@ -100,8 +104,12 @@ public class MathUtils {
public static double variance(Collection<Number> numbers, Number mean, boolean ignoreNan) {
double mn = mean.doubleValue();
double var = 0;
for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; }
if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); }
for (Number n : numbers) {
var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0;
}
if (ignoreNan) {
return var / (nonNanSize(numbers) - 1);
}
return var / (numbers.size() - 1);
}
@ -126,6 +134,7 @@ public class MathUtils {
/**
* Calculates the log10 cumulative sum of an array with log10 probabilities
*
* @param log10p the array with log10 probabilites
* @param upTo index in the array to calculate the cumsum up to
* @return the log10 of the cumulative sum
@ -136,6 +145,7 @@ public class MathUtils {
/**
* Converts a real space array of probabilities into a log10 array
*
* @param prRealSpace
* @return
*/
@ -219,7 +229,9 @@ public class MathUtils {
* @param b the second double value
* @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a.
*/
public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); }
public static byte compareDoubles(double a, double b) {
return compareDoubles(a, b, 1e-6);
}
/**
* Compares double values for equality (within epsilon), or inequality.
@ -229,10 +241,13 @@ public class MathUtils {
* @param epsilon the precision within which two double values will be considered equal
* @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a.
*/
public static byte compareDoubles(double a, double b, double epsilon)
{
if (Math.abs(a - b) < epsilon) { return 0; }
if (a > b) { return -1; }
public static byte compareDoubles(double a, double b, double epsilon) {
if (Math.abs(a - b) < epsilon) {
return 0;
}
if (a > b) {
return -1;
}
return 1;
}
@ -243,7 +258,9 @@ public class MathUtils {
* @param b the second float value
* @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a.
*/
public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); }
public static byte compareFloats(float a, float b) {
return compareFloats(a, b, 1e-6f);
}
/**
* Compares float values for equality (within epsilon), or inequality.
@ -253,15 +270,17 @@ public class MathUtils {
* @param epsilon the precision within which two float values will be considered equal
* @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a.
*/
public static byte compareFloats(float a, float b, float epsilon)
{
if (Math.abs(a - b) < epsilon) { return 0; }
if (a > b) { return -1; }
public static byte compareFloats(float a, float b, float epsilon) {
if (Math.abs(a - b) < epsilon) {
return 0;
}
if (a > b) {
return -1;
}
return 1;
}
public static double NormalDistribution(double mean, double sd, double x)
{
public static double NormalDistribution(double mean, double sd, double x) {
double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI));
double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd)));
return a * b;
@ -270,17 +289,17 @@ public class MathUtils {
public static double binomialCoefficient(int n, int k) {
return Math.pow(10, log10BinomialCoefficient(n, k));
}
/**
* Computes a binomial probability. This is computed using the formula
*
* <p/>
* B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k )
*
* <p/>
* where n is the number of trials, k is the number of successes, and p is the probability of success
*
* @param n number of Bernoulli trials
* @param k number of successes
* @param p probability of success
*
* @return the binomial probability of the specified configuration. Computes values down to about 1e-237.
*/
public static double binomialProbability(int n, int k, double p) {
@ -289,6 +308,7 @@ public class MathUtils {
/**
* Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space.
*
* @param start - start of the cumulant sum (over hits)
* @param end - end of the cumulant sum (over hits)
* @param total - number of attempts for the number of hits
@ -318,9 +338,9 @@ public class MathUtils {
/**
* Computes a multinomial coefficient efficiently avoiding overflow even for large numbers.
* This is computed using the formula:
*
* <p/>
* M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ]
*
* <p/>
* where xi represents the number of times outcome i was observed, n is the number of total observations.
* In this implementation, the value of n is inferred as the sum over i of xi.
*
@ -339,9 +359,9 @@ public class MathUtils {
/**
* Computes a multinomial probability efficiently avoiding overflow even for large numbers.
* This is computed using the formula:
*
* <p/>
* M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk)
*
* <p/>
* where xi represents the number of times outcome i was observed, n is the number of total observations, and
* pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is
* inferred as the sum over i of xi.
@ -365,6 +385,7 @@ public class MathUtils {
/**
* calculate the Root Mean Square of an array of integers
*
* @param x an byte[] of numbers
* @return the RMS of the specified numbers.
*/
@ -382,6 +403,7 @@ public class MathUtils {
/**
* calculate the Root Mean Square of an array of integers
*
* @param x an int[] of numbers
* @return the RMS of the specified numbers.
*/
@ -398,6 +420,7 @@ public class MathUtils {
/**
* calculate the Root Mean Square of an array of doubles
*
* @param x a double[] of numbers
* @return the RMS of the specified numbers.
*/
@ -444,7 +467,6 @@ public class MathUtils {
*
* @param array the array to be normalized
* @param takeLog10OfOutput if true, the output will be transformed back into log10 units
*
* @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed
*/
public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) {
@ -504,11 +526,11 @@ public class MathUtils {
return normalized;
}
/**
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
*
* @param array the array to be normalized
*
* @return a newly allocated array corresponding the normalized values in array
*/
public static double[] normalizeFromLog10(double[] array) {
@ -551,6 +573,10 @@ public class MathUtils {
return array[minElementIndex(array)];
}
public static int arrayMin(int[] array) {
return array[minElementIndex(array)];
}
public static byte arrayMin(byte[] array) {
return array[minElementIndex(array)];
}
@ -579,6 +605,18 @@ public class MathUtils {
return minI;
}
public static int minElementIndex(int[] array) {
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
int minI = -1;
for (int i = 0; i < array.length; i++) {
if (minI == -1 || array[i] < array[minI])
minI = i;
}
return minI;
}
public static int arrayMaxInt(List<Integer> array) {
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!");
@ -749,7 +787,9 @@ public class MathUtils {
}
/** Draw N random elements from list. */
/**
* Draw N random elements from list.
*/
public static <T> List<T> randomSubset(List<T> list, int N) {
if (list.size() <= N) {
return list;
@ -770,6 +810,25 @@ public class MathUtils {
return ans;
}
/**
* Draw N random elements from an array.
*
* @param array your objects
* @param n number of elements to select at random from the list
* @return a new list with the N randomly chosen elements from list
*/
@Requires({"array != null", "n>=0"})
@Ensures({"result != null", "result.length == Math.min(n, array.length)"})
public static Object[] randomSubset(final Object[] array, final int n) {
if (array.length <= n)
return array.clone();
Object[] shuffledArray = arrayShuffle(array);
Object[] result = new Object[n];
System.arraycopy(shuffledArray, 0, result, 0, n);
return result;
}
public static double percentage(double x, double base) {
return (base > 0 ? (x / base) * 100.0 : 0);
}
@ -872,7 +931,7 @@ public class MathUtils {
/**
* Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times
*
* @param indices the list of indices for elements to extract
* @param list the list from which the elements should be extracted
* @param <T> the template type of the ArrayList
@ -967,7 +1026,8 @@ public class MathUtils {
return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
}
/** A utility class that computes on the fly average and standard deviation for a stream of numbers.
/**
* A utility class that computes on the fly average and standard deviation for a stream of numbers.
* The number of observations does not have to be known in advance, and can be also very big (so that
* it could overflow any naive summation-based scheme or cause loss of precision).
* Instead, adding a new number <code>observed</code>
@ -993,10 +1053,21 @@ public class MathUtils {
}
}
public double mean() { return mean; }
public double stddev() { return Math.sqrt(s/(obs_count - 1)); }
public double var() { return s/(obs_count - 1); }
public long observationCount() { return obs_count; }
public double mean() {
return mean;
}
public double stddev() {
return Math.sqrt(s / (obs_count - 1));
}
public double var() {
return s / (obs_count - 1);
}
public long observationCount() {
return obs_count;
}
public RunningAverage clone() {
RunningAverage ra = new RunningAverage();
@ -1018,14 +1089,29 @@ public class MathUtils {
//
// useful common utility routines
//
public static double rate(long n, long d) { return n / (1.0 * Math.max(d, 1)); }
public static double rate(int n, int d) { return n / (1.0 * Math.max(d, 1)); }
public static double rate(long n, long d) {
return n / (1.0 * Math.max(d, 1));
}
public static long inverseRate(long n, long d) { return n == 0 ? 0 : d / Math.max(n, 1); }
public static long inverseRate(int n, int d) { return n == 0 ? 0 : d / Math.max(n, 1); }
public static double rate(int n, int d) {
return n / (1.0 * Math.max(d, 1));
}
public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); }
public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); }
public static long inverseRate(long n, long d) {
return n == 0 ? 0 : d / Math.max(n, 1);
}
public static long inverseRate(int n, int d) {
return n == 0 ? 0 : d / Math.max(n, 1);
}
public static double ratio(int num, int denom) {
return ((double) num) / (Math.max(denom, 1));
}
public static double ratio(long num, long denom) {
return ((double) num) / (Math.max(denom, 1));
}
public static final double[] log10Cache;
public static final double[] jacobianLogTable;
@ -1093,8 +1179,7 @@ public class MathUtils {
else if (diff >= 0) {
int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
return x + jacobianLogTable[ind];
}
else {
} else {
int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
return y + jacobianLogTable[ind];
}
@ -1110,6 +1195,7 @@ public class MathUtils {
/**
* Returns the phred scaled value of probability p
*
* @param p probability (between 0 and 1).
* @return phred scaled probability of p
*/
@ -1123,6 +1209,7 @@ public class MathUtils {
/**
* Converts LN to LOG10
*
* @param ln log(x)
* @return log10(x)
*/
@ -1240,14 +1327,28 @@ public class MathUtils {
else if (ix < 0x40000000) {
if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -Math.log(x);
if(ix>=0x3FE76944) {y = one-x; i= 0;}
else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
else {y = x; i=2;}
if (ix >= 0x3FE76944) {
y = one - x;
i = 0;
} else if (ix >= 0x3FCDA661) {
y = x - (tc - one);
i = 1;
} else {
y = x;
i = 2;
}
} else {
r = zero;
if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
else {y=x-one;i=2;}
if (ix >= 0x3FFBB4C3) {
y = 2.0 - x;
i = 0;
} /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) {
y = x - tc;
i = 1;
} /* [1.23,1.73] */ else {
y = x - one;
i = 2;
}
}
switch (i) {
@ -1256,7 +1357,8 @@ public class MathUtils {
p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
p = y * p1 + p2;
r += (p-0.5*y); break;
r += (p - 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
@ -1264,14 +1366,14 @@ public class MathUtils {
p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
p = z * p1 - (tt - w * (p2 + y * p3));
r += (tf + p); break;
r += (tf + p);
break;
case 2:
p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
r += (-0.5 * y + p1 / p2);
}
}
else if(ix<0x40200000) { /* x < 8.0 */
} else if (ix < 0x40200000) { /* x < 8.0 */
i = (int) x;
t = zero;
y = x - (double) i;
@ -1280,12 +1382,18 @@ public class MathUtils {
r = half * y + p / q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch (i) {
case 7: z *= (y+6.0); /* FALLTHRU */
case 6: z *= (y+5.0); /* FALLTHRU */
case 5: z *= (y+4.0); /* FALLTHRU */
case 4: z *= (y+3.0); /* FALLTHRU */
case 3: z *= (y+2.0); /* FALLTHRU */
r += Math.log(z); break;
case 7:
z *= (y + 6.0); /* FALLTHRU */
case 6:
z *= (y + 5.0); /* FALLTHRU */
case 5:
z *= (y + 4.0); /* FALLTHRU */
case 4:
z *= (y + 3.0); /* FALLTHRU */
case 3:
z *= (y + 2.0); /* FALLTHRU */
r += Math.log(z);
break;
}
/* 8.0 <= x < 2**58 */
} else if (ix < 0x43900000) {
@ -1372,4 +1480,40 @@ public class MathUtils {
public static double log10Factorial(int x) {
return log10Gamma(x + 1);
}
/**
* Adds two arrays together and returns a new array with the sum.
*
* @param a one array
* @param b another array
* @return a new array with the sum of a and b
*/
@Requires("a.length == b.length")
@Ensures("result.length == a.length")
public static int[] addArrays(int[] a, int[] b) {
int[] c = new int[a.length];
for (int i = 0; i < a.length; i++)
c[i] = a[i] + b[i];
return c;
}
/**
* Quick implementation of the Knuth-shuffle algorithm to generate a random
* permutation of the given array.
*
* @param array the original array
* @return a new array with the elements shuffled
*/
public static Object[] arrayShuffle(Object[] array) {
int n = array.length;
Object[] shuffled = array.clone();
for (int i = 0; i < n; i++) {
int j = i + GenomeAnalysisEngine.getRandomGenerator().nextInt(n - i);
Object tmp = shuffled[i];
shuffled[i] = shuffled[j];
shuffled[j] = tmp;
}
return shuffled;
}
}

View File

@ -1,10 +1,14 @@
package org.broadinstitute.sting.utils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
@ -17,6 +21,8 @@ import java.util.List;
* A set of static utility methods for common operations on paths.
*/
public class PathUtils {
private static Logger logger = Logger.getLogger(PathUtils.class);
/**
* Constructor access disallowed...static utility methods only!
*/
@ -118,4 +124,47 @@ public class PathUtils {
}
}
/**
* Walk over the GATK released directories to find the most recent JAR files corresponding
* to the version prefix. For example, providing input "1.2" will
* return the full path to the most recent GenomeAnalysisTK.jar in the GATK_RELEASE_DIR
* in directories that match gatkReleaseDir/GenomeAnalysisTK-1.2*
*
* @param gatkReleaseDir Path to directory containing GATK release binaries (e.g., /humgen/gsa-hpprojects/GATK/bin/)
* @param releaseVersionNumber Desired GATK version number (e.g., 1.2)
* @return A file pointing to the most recent GATK file in the release directory with GATK release number
*/
public static File findMostRecentGATKVersion(final File gatkReleaseDir, final String releaseVersionNumber) {
final String versionString = "GenomeAnalysisTK-" + releaseVersionNumber;
final List<File> gatkJars = new ArrayList<File>();
for ( final String path : gatkReleaseDir.list(new isGATKVersion(versionString)) ) {
gatkJars.add(new File(gatkReleaseDir.getAbsolutePath() + "/" + path + "/GenomeAnalysisTK.jar"));
}
if ( gatkJars.isEmpty() )
return null;
else {
Collections.sort(gatkJars, LastModifiedFileComparator.LASTMODIFIED_REVERSE);
//for ( File jar : gatkJars ) logger.info(String.format("%s => %d", jar, jar.lastModified()));
final File last = gatkJars.get(0);
logger.debug(String.format("findMostRecentGATKVersion: Found %d jars for %s, keeping last one %s",
gatkJars.size(), releaseVersionNumber, last));
return last;
}
}
private final static class isGATKVersion implements FilenameFilter {
private final String versionString;
private isGATKVersion(final String versionString) {
this.versionString = versionString;
}
@Override
public boolean accept(final File file, final String s) {
return s.contains(versionString);
}
}
}

View File

@ -134,8 +134,7 @@ public class ClippingOp {
unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH));
matchesCount = 0;
unclippedCigar.add(element);
}
else
} else
unclippedCigar.add(element);
}
if (matchesCount > 0)
@ -287,7 +286,6 @@ public class ClippingOp {
private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) {
if (start == 0 && stop == read.getReadLength() - 1)
return GATKSAMRecord.emptyRead(read);
// return new GATKSAMRecord(read.getHeader());
// If the read is unmapped there is no Cigar string and neither should we create a new cigar string
@ -325,7 +323,7 @@ public class ClippingOp {
Cigar newCigar = new Cigar();
int index = 0;
int totalHardClipCount = stop - start + 1;
int alignmentShift = 0; // caused by hard clipping insertions or deletions
int alignmentShift = 0; // caused by hard clipping deletions
// hard clip the beginning of the cigar string
if (start == 0) {
@ -448,7 +446,7 @@ public class ClippingOp {
CigarElement cigarElement = cigarStack.pop();
if (!readHasStarted &&
cigarElement.getOperator() != CigarOperator.INSERTION &&
// cigarElement.getOperator() != CigarOperator.INSERTION &&
cigarElement.getOperator() != CigarOperator.DELETION &&
cigarElement.getOperator() != CigarOperator.HARD_CLIP)
readHasStarted = true;
@ -456,9 +454,6 @@ public class ClippingOp {
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
totalHardClip += cigarElement.getLength();
else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION)
shift += cigarElement.getLength();
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
totalHardClip += cigarElement.getLength();
@ -470,8 +465,7 @@ public class ClippingOp {
addedHardClips = true;
}
inverseCigarStack.push(cigarElement);
}
else {
} else {
if (!addedHardClips) {
if (totalHardClip > 0)
cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));

View File

@ -374,24 +374,43 @@ public class ReadClipper {
* Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
* and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
*
* Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the
* left of right tail) by specifying either refStart < 0 or refStop < 0.
*
* @param refStart first base to clip (inclusive)
* @param refStop last base to clip (inclusive)
* @return a new read, without the clipped bases
*/
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
@Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (read.isEmpty())
return read;
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
return GATKSAMRecord.emptyRead(read);
// return new GATKSAMRecord(read.getHeader());
int start;
int stop;
// Determine the read coordinate to start and stop hard clipping
if (refStart < 0) {
if (refStop < 0)
throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")");
start = 0;
stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
}
else {
if (refStop >= 0)
throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")");
start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
stop = read.getReadLength() - 1;
}
// if ((start == 0 && stop == read.getReadLength() - 1))
// return GATKSAMRecord.emptyRead(read);
if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop));
if ( start > 0 && stop < read.getReadLength() - 1)
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));

View File

@ -526,6 +526,42 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
}
}
/**
* Gets the pileup for a set of read groups. Horrendously inefficient at this point.
* @param rgSet List of identifiers for the read groups.
* @return A read-backed pileup containing only the reads in the given read groups.
*/
@Override
public RBP getPileupForReadGroups(final HashSet<String> rgSet) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();
for(final String sample: tracker.getSamples()) {
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup<RBP,PE> pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroups(rgSet);
if(pileup != null)
filteredTracker.addElements(sample,pileup.pileupElementTracker);
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
else {
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
for(PE p: pileupElementTracker) {
GATKSAMRecord read = p.getRead();
if(rgSet != null && !rgSet.isEmpty()) {
if(read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId()))
filteredTracker.add(p);
}
else {
if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null)
filteredTracker.add(p);
}
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
}
@Override
public RBP getPileupForLane(String laneID) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
/**
@ -129,6 +130,13 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public ReadBackedPileup getPileupForReadGroup(String readGroupId);
/**
* Gets all the reads associated with a given read groups.
* @param rgSet Set of identifiers for the read group.
* @return A pileup containing only the reads in the given read groups.
*/
public ReadBackedPileup getPileupForReadGroups(final HashSet<String> rgSet);
/**
* Gets all reads in a given lane id. (Lane ID is the read group
* id stripped of the last .XX sample identifier added by the GATK).

View File

@ -238,7 +238,7 @@ public class ArtificialSAMUtils {
*/
public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar);
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
}

View File

@ -24,7 +24,6 @@
package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.NGSPlatform;
@ -184,6 +183,12 @@ public class GATKSAMRecord extends BAMRecord {
return getReducedReadCounts() != null;
}
/**
* The number of bases corresponding the i'th base of the reduced read.
*
* @param i the read based coordinate inside the read
* @return the number of bases corresponding to the i'th base of the reduced read
*/
public final byte getReducedCount(final int i) {
byte firstCount = getReducedReadCounts()[0];
byte offsetCount = getReducedReadCounts()[i];
@ -277,7 +282,6 @@ public class GATKSAMRecord extends BAMRecord {
*
* @return the unclipped start of the read taking soft clips (but not hard clips) into account
*/
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftStart() {
int start = this.getUnclippedStart();
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
@ -286,17 +290,17 @@ public class GATKSAMRecord extends BAMRecord {
else
break;
}
return start;
}
/**
* Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
*
* Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
* Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
*
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
*/
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftEnd() {
int stop = this.getUnclippedStart();
@ -313,6 +317,7 @@ public class GATKSAMRecord extends BAMRecord {
else
shift = 0;
}
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}

View File

@ -29,6 +29,7 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -58,7 +59,7 @@ public class ReadUtils {
/**
* A HashMap of the SAM spec read flag names
* <p/>
*
* Note: This is not being used right now, but can be useful in the future
*/
private static final Map<Integer, String> readFlagNames = new HashMap<Integer, String>();
@ -79,49 +80,47 @@ public class ReadUtils {
/**
* This enum represents all the different ways in which a read can overlap an interval.
* <p/>
*
* NO_OVERLAP_CONTIG:
* read and interval are in different contigs.
* <p/>
*
* NO_OVERLAP_LEFT:
* the read does not overlap the interval.
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* NO_OVERLAP_RIGHT:
* the read does not overlap the interval.
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval
* <p/>
*
* |-----------| (interval)
* <-------------------> (read)
* <p/>
*
* OVERLAP_CONTAINED:
* the read starts and ends inside the interval
* <p/>
*
* |----------------| (interval)
* <--------> (read)
*/
public enum ReadAndIntervalOverlap {
NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED
}
public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
/**
* Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular
@ -141,12 +140,12 @@ public class ReadUtils {
/**
* is this base inside the adaptor of the read?
* <p/>
*
* There are two cases to treat here:
* <p/>
*
* 1) Read is in the negative strand => Adaptor boundary is on the left tail
* 2) Read is in the positive strand => Adaptor boundary is on the right tail
* <p/>
*
* Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event)
*
* @param read the read to test
@ -166,21 +165,21 @@ public class ReadUtils {
* the read boundary. If the read is in the positive strand, this is the first base after the end of the
* fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the
* beginning of the fragment.
* <p/>
*
* There are two cases we need to treat here:
* <p/>
*
* 1) Our read is in the reverse strand :
* <p/>
*
* <----------------------| *
* |--------------------->
* <p/>
*
* in these cases, the adaptor boundary is at the mate start (minus one)
* <p/>
*
* 2) Our read is in the forward strand :
* <p/>
*
* |----------------------> *
* <----------------------|
* <p/>
*
* in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
*
* @param read the read being tested for the adaptor boundary
@ -264,7 +263,7 @@ public class ReadUtils {
/**
* If a read starts in INSERTION, returns the first element length.
* <p/>
*
* Warning: If the read has Hard or Soft clips before the insertion this function will return 0.
*
* @param read
@ -280,7 +279,7 @@ public class ReadUtils {
/**
* If a read ends in INSERTION, returns the last element length.
* <p/>
*
* Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
*
* @param read
@ -297,7 +296,6 @@ public class ReadUtils {
/**
* Determines what is the position of the read in relation to the interval.
* Note: This function uses the UNCLIPPED ENDS of the reads for the comparison.
*
* @param read the read
* @param interval the interval
* @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
@ -340,36 +338,52 @@ public class ReadUtils {
}
/**
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in
* a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns
* the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the
* deletion.
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of
* two corner cases:
*
* 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside
* a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it
* doesn't matter because it already returns the previous base by default.
*
* 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the
* read starts with an insertion, and you're requesting the first read based coordinate, it will skip
* the leading insertion (because it has the same reference coordinate as the following base).
*
* @param read
* @param refCoord
* @param tail
* @return the read coordinate corresponding to the requested reference coordinate for clipping.
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst();
// Corner case one: clipping the right tail and falls on deletion, move to the next
// read coordinate. It is not a problem for the left tail because the default answer
// from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate.
if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
readCoord++;
// clipping the left tail and first base is insertion, go to the next read coordinate
// with the same reference coordinate. Advance to the next cigar element, or to the
// end of the read if there is no next element.
Pair<Boolean, CigarElement> firstElementIsInsertion = readStartsWithInsertion(read);
if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst())
readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1);
return readCoord;
}
/**
* Returns the read coordinate corresponding to the requested reference coordinate.
* <p/>
*
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion.
* <p/>
*
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs.
@ -457,7 +471,6 @@ public class ReadUtils {
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
}
@ -465,12 +478,11 @@ public class ReadUtils {
* Compares two SAMRecords only the basis on alignment start. Note that
* comparisons are performed ONLY on the basis of alignment start; any
* two SAM records with the same alignment start will be considered equal.
* <p/>
*
* Unmapped alignments will all be considered equal.
*/
@Requires({"read1 != null", "read2 != null"})
@Ensures("result == 0 || result == 1 || result == -1")
public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) {
AlignmentStartComparator comp = new AlignmentStartComparator();
return comp.compare(read1, read2);
@ -502,4 +514,134 @@ public class ReadUtils {
}
/**
* Checks if a read starts with an insertion. It looks beyond Hard and Soft clips
* if there are any.
*
* @param read
* @return A pair with the answer (true/false) and the element or null if it doesn't exist
*/
public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return new Pair<Boolean, CigarElement>(true, cigarElement);
else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP)
break;
}
return new Pair<Boolean, CigarElement>(false, null);
}
/**
* Returns the coverage distribution of a list of reads within the desired region.
*
* See getCoverageDistributionOfRead for information on how the coverage is calculated.
*
* @param list the list of reads covering the region
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return an array with the coverage of each position from startLocation to stopLocation
*/
public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
int [] totalCoverage = new int[stopLocation - startLocation + 1];
for (GATKSAMRecord read : list) {
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage);
}
return totalCoverage;
}
/**
* Returns the coverage distribution of a single read within the desired region.
*
* Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
* reads for variant regions, and deletions count as variants)
*
* @param read the read to get the coverage distribution of
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return an array with the coverage of each position from startLocation to stopLocation
*/
public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
int [] coverage = new int[stopLocation - startLocation + 1];
int refLocation = read.getSoftStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
switch (cigarElement.getOperator()) {
case S:
case M:
case EQ:
case N:
case X:
case D:
for (int i = 0; i < cigarElement.getLength(); i++) {
if (refLocation >= startLocation && refLocation <= stopLocation) {
int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1;
coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases
}
refLocation++;
}
break;
case P:
case I:
case H:
break;
}
if (refLocation > stopLocation)
break;
}
return coverage;
}
/**
* Makes association maps for the reads and loci coverage as described below :
*
* - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage.
* Note: Locus is in reference coordinates.
* Example: Locus => {read1, read2, ..., readN}
*
* - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage.
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage.
* Example: Read => {true, true, false, ... false}
*
* @param readList the list of reads to generate the association mappings
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return the two hashmaps described above
*/
public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
int arraySize = stopLocation - startLocation + 1;
HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);
for (int i = startLocation; i <= stopLocation; i++)
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists
for (GATKSAMRecord read : readList) {
readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
for (int i=0; i<readCoverage.length; i++) {
int refLocation = i + startLocation;
if (readCoverage[i] > 0) {
// Update the hash for this locus
HashSet<GATKSAMRecord> readSet = locusToReadMap.get(refLocation);
readSet.add(read);
// Add this locus to the read hash
readToLocusMap.get(read)[refLocation - startLocation] = true;
}
else
// Update the boolean array with a 'no coverage' from this read to this locus
readToLocusMap.get(read)[refLocation-startLocation] = false;
}
}
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
}
}

View File

@ -202,7 +202,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorSolidIndelsRemoveRefBias() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "613fb2bbe01af8cbe6a188a10c1582ca" );
e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -308,7 +308,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorNoIndex() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "13c83656567cee9e93bda9874ee80234" );
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "991f093a0e610df235d28ada418ebf33" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();

View File

@ -26,22 +26,20 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.*;
/**
* Basic unit test for MathUtils
*/
public class MathUtilsUnitTest extends BaseTest {
@BeforeClass
public void init() { }
public void init() {
}
/**
* Tests that we get the right values from the binomial distribution
@ -123,7 +121,9 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha));
}
/** Tests that we correctly compute mean and standard deviation from a stream of numbers */
/**
* Tests that we correctly compute mean and standard deviation from a stream of numbers
*/
@Test
public void testRunningAverage() {
logger.warn("Executing testRunningAverage");
@ -174,4 +174,56 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1);
}
@Test(enabled = true)
public void testRandomSubset() {
Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0);
Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1);
Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2);
Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3);
Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4);
Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5);
Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6);
Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7);
Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8);
Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9);
Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10);
Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10);
for (int i = 0; i < 25; i++)
Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5)));
}
@Test(enabled = true)
public void testArrayShuffle() {
Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
for (int i = 0; i < 25; i++) {
Object[] t = MathUtils.arrayShuffle(x);
Assert.assertTrue(hasUniqueElements(t));
Assert.assertTrue(hasAllElements(x, t));
}
}
private boolean hasUniqueElements(Object[] x) {
for (int i = 0; i < x.length; i++)
for (int j = i + 1; j < x.length; j++)
if (x[i].equals(x[j]) || x[i] == x[j])
return false;
return true;
}
private boolean hasAllElements(final Object[] expected, final Object[] actual) {
HashSet<Object> set = new HashSet<Object>();
set.addAll(Arrays.asList(expected));
set.removeAll(Arrays.asList(actual));
return set.isEmpty();
}
private void p (Object []x) {
for (Object v: x)
System.out.print((Integer) v + " ");
System.out.println();
}
}

View File

@ -112,7 +112,8 @@ public class ReadClipperTestUtils {
}
}
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
}
@ -190,4 +191,18 @@ public class ReadClipperTestUtils {
return invertedCigar;
}
/**
* Checks whether or not the read has any cigar element that is not H or S
*
* @param read
* @return true if it has any M, I or D, false otherwise
*/
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements())
if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
return true;
return false;
}
}

View File

@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.util.HashMap;
import java.util.List;
/**
@ -63,6 +63,7 @@ public class ReadClipperUnitTest extends BaseTest {
GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i);
Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
assertUnclippedLimits(read, clippedRead);
}
}
}
@ -75,9 +76,11 @@ public class ReadClipperUnitTest extends BaseTest {
for (int i = 0; i < readLength; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i);
Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1);
Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}
@ -86,19 +89,27 @@ public class ReadClipperUnitTest extends BaseTest {
public void testHardClipByReferenceCoordinates() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd();
for (int i=alnStart; i<=alnEnd; i++) {
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i);
if (!clipLeft.isEmpty())
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
int start = read.getSoftStart();
int stop = read.getSoftEnd();
// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop));
// if (ReadUtils.readIsEntirelyInsertion(read))
// System.out.println("debug");
for (int i = start; i <= stop; i++) {
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i);
if (!clipLeft.isEmpty()) {
// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
}
if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side
GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString()));
Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}
@ -113,8 +124,12 @@ public class ReadClipperUnitTest extends BaseTest {
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side
for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i);
if (!clipLeft.isEmpty())
if (!clipLeft.isEmpty()) {
// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
}
}
}
}
@ -129,8 +144,10 @@ public class ReadClipperUnitTest extends BaseTest {
if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side
for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}
}
@ -148,40 +165,33 @@ public class ReadClipperUnitTest extends BaseTest {
byte[] quals = new byte[readLength];
for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) {
// create a read with nLowQualBases in the left tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail
for (int addLeft = 0; addLeft < nLowQualBases; addLeft++)
quals[addLeft] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests
assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed
assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
// Make sure the low qualities are gone
assertNoLowQualBases(clipLeft, LOW_QUAL);
// Can't run this test with the current contract of no hanging insertions
// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
// create a read with nLowQualBases in the right tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail
for (int addRight = 0; addRight < nLowQualBases; addRight++)
quals[readLength - addRight - 1] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests
// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString()));
// Make sure the low qualities are gone
assertNoLowQualBases(clipRight, LOW_QUAL);
assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed
assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));
// Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));
// create a read with nLowQualBases in the both tails
if (nLowQualBases <= readLength / 2) {
Utils.fillArrayWithByte(quals, HIGH_QUAL);
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails
for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) {
quals[addBoth] = LOW_QUAL;
quals[readLength - addBoth - 1] = LOW_QUAL;
@ -189,83 +199,25 @@ public class ReadClipperUnitTest extends BaseTest {
read.setBaseQualities(quals);
GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests
// Make sure the low qualities are gone
assertNoLowQualBases(clipBoth, LOW_QUAL);
// Can't run this test with the current contract of no hanging insertions
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed
assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
}
}
// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString()));
}
// ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug )
final byte[] BASES = {'A','C','G','T','A','C','G','T'};
final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2};
final String CIGAR = "1S1M5I1S";
final byte[] CLIPPED_BASES = {};
final byte[] CLIPPED_QUALS = {};
final String CLIPPED_CIGAR = "";
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR);
GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR);
ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected);
}
@Test(enabled = true)
public void testHardClipSoftClippedBases() {
// Generate a list of cigars to test
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read);
CigarCounter original = new CigarCounter(read);
CigarCounter clipped = new CigarCounter(clippedRead);
int sumHardClips = 0;
int sumMatches = 0;
boolean tail = true;
for (CigarElement element : read.getCigar().getCigarElements()) {
// Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right)
if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP)
tail = true;
// Adds all H, S and D's (next to hard/soft clips).
// All these should be hard clips after clipping.
if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION))
sumHardClips += element.getLength();
// this means we're no longer on the tail (insertions can still potentially be the tail because
// of the current contract of clipping out hanging insertions
else if (element.getOperator() != CigarOperator.INSERTION)
tail = false;
// Adds all matches to verify that they remain the same after clipping
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches += element.getLength();
}
for (CigarElement element : clippedRead.getCigar().getCigarElements()) {
// Test if clipped read has Soft Clips (shouldn't have any!)
Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString()));
// Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for
if (element.getOperator() == CigarOperator.HARD_CLIP)
sumHardClips -= element.getLength();
// Make sure all matches are still there
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches -= element.getLength();
}
Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips));
Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches));
// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString()));
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS
}
}
@ -276,6 +228,8 @@ public class ReadClipperUnitTest extends BaseTest {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read);
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION);
if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar()))
expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION);
@ -283,16 +237,14 @@ public class ReadClipperUnitTest extends BaseTest {
if (!clippedRead.isEmpty()) {
Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there
Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone
}
else
} else
Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped
}
}
}
@Test(enabled = true)
public void testRevertSoftClippedBases()
{
public void testRevertSoftClippedBases() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
@ -300,14 +252,15 @@ public class ReadClipperUnitTest extends BaseTest {
final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read);
assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed
if (leadingSoftClips > 0 || tailSoftClips > 0) {
final int expectedStart = read.getAlignmentStart() - leadingSoftClips;
final int expectedEnd = read.getAlignmentEnd() + tailSoftClips;
Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart);
Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd);
}
else
} else
Assert.assertEquals(read.getCigarString(), unclipped.getCigarString());
}
}
@ -321,6 +274,19 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
/**
* Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd
*
* @param original
* @param clipped
*/
private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) {
if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) {
Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart());
Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd());
}
}
private boolean startsWithInsertion(Cigar cigar) {
return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0;
}
@ -341,4 +307,40 @@ public class ReadClipperUnitTest extends BaseTest {
return true;
return false;
}
private class CigarCounter {
private HashMap<CigarOperator, Integer> counter;
public Integer getCounterForOp(CigarOperator operator) {
return counter.get(operator);
}
public CigarCounter(GATKSAMRecord read) {
CigarOperator[] operators = CigarOperator.values();
counter = new HashMap<CigarOperator, Integer>(operators.length);
for (CigarOperator op : operators)
counter.put(op, 0);
for (CigarElement cigarElement : read.getCigar().getCigarElements())
counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength());
}
public boolean assertHardClippingSoftClips(CigarCounter clipped) {
for (CigarOperator op : counter.keySet()) {
if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) {
int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP);
int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP);
int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP);
Assert.assertEquals(counterTotal, clippedHard);
Assert.assertTrue(clippedSoft == 0);
} else
Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op));
}
return true;
}
}
}