Minor bug fix in k-means implementation. Updating VQSR integration tests in preparation for VQSRv2 by removing some unused features such as VariantDatum.weight and ti/tv cutting.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5410 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fa7284b7a1
commit
509daac9f7
|
|
@ -87,7 +87,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
private double QUAL_THRESHOLD = 100.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
|
|||
|
|
@ -224,7 +224,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
empiricalMu = new double[numAnnotations];
|
||||
final double[][] sigmaVals = new double[numAnnotations][numAnnotations];
|
||||
|
||||
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
empiricalMu[jjj] = 0.0;
|
||||
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
|
||||
|
|
@ -268,7 +267,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||
//mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||
mu[kkk] = new double[numAnnotations];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
|
||||
}
|
||||
|
|
@ -310,10 +310,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
int numAssigned = 0;
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if(assignment[iii] == kkk) {
|
||||
numAssigned++;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += data[iii].annotations[jjj];
|
||||
final VariantDatum datum = data[iii];
|
||||
if(datum.weight > 0.0) {
|
||||
if(assignment[iii] == kkk) {
|
||||
numAssigned++;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += data[iii].annotations[jjj];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -924,6 +927,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
||||
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
||||
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
||||
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -371,6 +371,14 @@ public class MathUtils {
|
|||
return Math.sqrt(rms);
|
||||
}
|
||||
|
||||
public static double distanceSquared( final double[] x, final double[] y ) {
|
||||
double dist = 0.0;
|
||||
for(int iii = 0; iii < x.length; iii++) {
|
||||
dist += (x[iii] - y[iii]) * (x[iii] - y[iii]);
|
||||
}
|
||||
return dist;
|
||||
}
|
||||
|
||||
/**
|
||||
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
||||
*
|
||||
|
|
@ -401,6 +409,27 @@ public class MathUtils {
|
|||
return normalized;
|
||||
}
|
||||
|
||||
public static double[] normalizeFromLog10(List<Double> array, boolean takeLog10OfOutput) {
|
||||
double[] normalized = new double[array.size()];
|
||||
|
||||
// for precision purposes, we need to add (or really subtract, since they're
|
||||
// all negative) the largest value; also, we need to convert to normal-space.
|
||||
double maxValue = MathUtils.arrayMax( array );
|
||||
for (int i = 0; i < array.size(); i++)
|
||||
normalized[i] = Math.pow(10, array.get(i) - maxValue);
|
||||
|
||||
// normalize
|
||||
double sum = 0.0;
|
||||
for (int i = 0; i < array.size(); i++)
|
||||
sum += normalized[i];
|
||||
for (int i = 0; i < array.size(); i++) {
|
||||
double x = normalized[i] / sum;
|
||||
if ( takeLog10OfOutput ) x = Math.log10(x);
|
||||
normalized[i] = x;
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
/**
|
||||
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
||||
*
|
||||
|
|
@ -412,6 +441,10 @@ public class MathUtils {
|
|||
return normalizeFromLog10(array, false);
|
||||
}
|
||||
|
||||
public static double[] normalizeFromLog10(List<Double> array) {
|
||||
return normalizeFromLog10(array, false);
|
||||
}
|
||||
|
||||
public static int maxElementIndex(double[] array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
|
|
@ -469,6 +502,15 @@ public class MathUtils {
|
|||
return m;
|
||||
}
|
||||
|
||||
public static double arrayMax(List<Double> array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!");
|
||||
|
||||
double m = array.get(0);
|
||||
for ( double e : array ) m = Math.max(m, e);
|
||||
return m;
|
||||
}
|
||||
|
||||
public static double average(List<Long> vals, int maxI) {
|
||||
long sum = 0L;
|
||||
|
||||
|
|
@ -814,6 +856,15 @@ public class MathUtils {
|
|||
return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.));
|
||||
}
|
||||
|
||||
// from http://en.wikipedia.org/wiki/Digamma_function
|
||||
// According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by:
|
||||
public static double diGamma(final double x) {
|
||||
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
||||
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
||||
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
||||
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||
}
|
||||
|
||||
/** A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
||||
* The number of observations does not have to be known in advance, and can be also very big (so that
|
||||
* it could overflow any naive summation-based scheme or cause loss of precision).
|
||||
|
|
|
|||
|
|
@ -29,16 +29,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
"4eeffa7a1965ce0c25c5edd0bae76290", // in vcf
|
||||
"a04e00d00c991d76900634376bc1a9d1", // tranches
|
||||
"f34b36c1da8bcb080a584592d1f6dae7", // recalVCF
|
||||
"ee07ff6c0ff6108e00b131ce3c7a7f65"); // cut VCF
|
||||
"e14079c3a02c112665a6c194fd4f5d5c", // cluster file
|
||||
"dce581b880ffb6ea39cbada1ecc95915", // tranches
|
||||
"c3e8a2f43656eab7d847dbf850f844a6", // recalVCF
|
||||
"50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF
|
||||
|
||||
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
||||
"8937a3ae7f176dacf47b8ee6c0023416", // in vcf
|
||||
"8da48dec888c9d4e3c3c7ed7943c0c61", // tranches
|
||||
"ae6a1e0874c966312e891b5a3c47b0e3", // recalVCF
|
||||
"3ac16abdc5bbd5148f0cf88e8a7af3c9"); // cut VCF
|
||||
"b0c0f8c8d9fe3d1ed2bde0b1eb82a22d", // cluster file
|
||||
"66edae83c50f4e8601fef7fafba774af", // tranches
|
||||
"0123537e373657386068a534c0f5c91b", // recalVCF
|
||||
"2172368e8585841e5ad96c95d0827c4b"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
@ -51,14 +51,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 1.0 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClusters" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an SB" +
|
||||
" -an QD -an MQ -an SB" +
|
||||
" -clusterFile %s",
|
||||
Arrays.asList(params.clusterMD5));
|
||||
executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
|
||||
|
|
@ -72,13 +72,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -T VariantRecalibrator" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + getFileForMD5(params.clusterMD5) +
|
||||
" -titv 2.07" +
|
||||
" -sm TRUTH_SENSITIVITY" +
|
||||
" -o %s" +
|
||||
" -tranchesFile %s",
|
||||
Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
|
||||
|
|
|
|||
Loading…
Reference in New Issue