Minor bug fix in k-means implementation. Updating VQSR integration tests in preparation for VQSRv2 by removing some unused features such as VariantDatum.weight and ti/tv cutting.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5410 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fa7284b7a1
commit
509daac9f7
|
|
@ -87,7 +87,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
||||||
private double QUAL_THRESHOLD = 100.0;
|
private double QUAL_THRESHOLD = 100.0;
|
||||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||||
private double SHRINKAGE = 0.0001;
|
private double SHRINKAGE = 0.0001;
|
||||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
|
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||||
private double DIRICHLET_PARAMETER = 1000.0;
|
private double DIRICHLET_PARAMETER = 1000.0;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
|
||||||
|
|
@ -224,7 +224,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
empiricalMu = new double[numAnnotations];
|
empiricalMu = new double[numAnnotations];
|
||||||
final double[][] sigmaVals = new double[numAnnotations][numAnnotations];
|
final double[][] sigmaVals = new double[numAnnotations][numAnnotations];
|
||||||
|
|
||||||
|
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
empiricalMu[jjj] = 0.0;
|
empiricalMu[jjj] = 0.0;
|
||||||
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
|
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
|
||||||
|
|
@ -268,7 +267,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
final int numAnnotations = data[0].annotations.length;
|
final int numAnnotations = data[0].annotations.length;
|
||||||
|
|
||||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||||
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
//mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||||
|
mu[kkk] = new double[numAnnotations];
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
|
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
|
||||||
}
|
}
|
||||||
|
|
@ -310,10 +310,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
int numAssigned = 0;
|
int numAssigned = 0;
|
||||||
|
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||||
if(assignment[iii] == kkk) {
|
final VariantDatum datum = data[iii];
|
||||||
numAssigned++;
|
if(datum.weight > 0.0) {
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
if(assignment[iii] == kkk) {
|
||||||
mu[kkk][jjj] += data[iii].annotations[jjj];
|
numAssigned++;
|
||||||
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
|
mu[kkk][jjj] += data[iii].annotations[jjj];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -924,6 +927,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
||||||
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
||||||
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
||||||
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -371,6 +371,14 @@ public class MathUtils {
|
||||||
return Math.sqrt(rms);
|
return Math.sqrt(rms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double distanceSquared( final double[] x, final double[] y ) {
|
||||||
|
double dist = 0.0;
|
||||||
|
for(int iii = 0; iii < x.length; iii++) {
|
||||||
|
dist += (x[iii] - y[iii]) * (x[iii] - y[iii]);
|
||||||
|
}
|
||||||
|
return dist;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
||||||
*
|
*
|
||||||
|
|
@ -401,6 +409,27 @@ public class MathUtils {
|
||||||
return normalized;
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double[] normalizeFromLog10(List<Double> array, boolean takeLog10OfOutput) {
|
||||||
|
double[] normalized = new double[array.size()];
|
||||||
|
|
||||||
|
// for precision purposes, we need to add (or really subtract, since they're
|
||||||
|
// all negative) the largest value; also, we need to convert to normal-space.
|
||||||
|
double maxValue = MathUtils.arrayMax( array );
|
||||||
|
for (int i = 0; i < array.size(); i++)
|
||||||
|
normalized[i] = Math.pow(10, array.get(i) - maxValue);
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 0; i < array.size(); i++)
|
||||||
|
sum += normalized[i];
|
||||||
|
for (int i = 0; i < array.size(); i++) {
|
||||||
|
double x = normalized[i] / sum;
|
||||||
|
if ( takeLog10OfOutput ) x = Math.log10(x);
|
||||||
|
normalized[i] = x;
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
||||||
*
|
*
|
||||||
|
|
@ -412,6 +441,10 @@ public class MathUtils {
|
||||||
return normalizeFromLog10(array, false);
|
return normalizeFromLog10(array, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double[] normalizeFromLog10(List<Double> array) {
|
||||||
|
return normalizeFromLog10(array, false);
|
||||||
|
}
|
||||||
|
|
||||||
public static int maxElementIndex(double[] array) {
|
public static int maxElementIndex(double[] array) {
|
||||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||||
|
|
||||||
|
|
@ -469,6 +502,15 @@ public class MathUtils {
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double arrayMax(List<Double> array) {
|
||||||
|
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||||
|
if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!");
|
||||||
|
|
||||||
|
double m = array.get(0);
|
||||||
|
for ( double e : array ) m = Math.max(m, e);
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
public static double average(List<Long> vals, int maxI) {
|
public static double average(List<Long> vals, int maxI) {
|
||||||
long sum = 0L;
|
long sum = 0L;
|
||||||
|
|
||||||
|
|
@ -814,6 +856,15 @@ public class MathUtils {
|
||||||
return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.));
|
return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// from http://en.wikipedia.org/wiki/Digamma_function
|
||||||
|
// According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by:
|
||||||
|
public static double diGamma(final double x) {
|
||||||
|
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
||||||
|
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
||||||
|
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
||||||
|
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||||
|
}
|
||||||
|
|
||||||
/** A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
/** A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
||||||
* The number of observations does not have to be known in advance, and can be also very big (so that
|
* The number of observations does not have to be known in advance, and can be also very big (so that
|
||||||
* it could overflow any naive summation-based scheme or cause loss of precision).
|
* it could overflow any naive summation-based scheme or cause loss of precision).
|
||||||
|
|
|
||||||
|
|
@ -29,16 +29,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||||
"4eeffa7a1965ce0c25c5edd0bae76290", // in vcf
|
"e14079c3a02c112665a6c194fd4f5d5c", // cluster file
|
||||||
"a04e00d00c991d76900634376bc1a9d1", // tranches
|
"dce581b880ffb6ea39cbada1ecc95915", // tranches
|
||||||
"f34b36c1da8bcb080a584592d1f6dae7", // recalVCF
|
"c3e8a2f43656eab7d847dbf850f844a6", // recalVCF
|
||||||
"ee07ff6c0ff6108e00b131ce3c7a7f65"); // cut VCF
|
"50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF
|
||||||
|
|
||||||
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
||||||
"8937a3ae7f176dacf47b8ee6c0023416", // in vcf
|
"b0c0f8c8d9fe3d1ed2bde0b1eb82a22d", // cluster file
|
||||||
"8da48dec888c9d4e3c3c7ed7943c0c61", // tranches
|
"66edae83c50f4e8601fef7fafba774af", // tranches
|
||||||
"ae6a1e0874c966312e891b5a3c47b0e3", // recalVCF
|
"0123537e373657386068a534c0f5c91b", // recalVCF
|
||||||
"3ac16abdc5bbd5148f0cf88e8a7af3c9"); // cut VCF
|
"2172368e8585841e5ad96c95d0827c4b"); // cut VCF
|
||||||
|
|
||||||
@DataProvider(name = "VRTest")
|
@DataProvider(name = "VRTest")
|
||||||
public Object[][] createData1() {
|
public Object[][] createData1() {
|
||||||
|
|
@ -51,14 +51,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
"-R " + b36KGReference +
|
"-R " + b36KGReference +
|
||||||
" -NO_HEADER" +
|
" -NO_HEADER" +
|
||||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
|
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
" -weightDBSNP 1.0 -weightHapMap 1.0" +
|
||||||
" -T GenerateVariantClusters" +
|
" -T GenerateVariantClusters" +
|
||||||
" -B:input,VCF " + params.inVCF +
|
" -B:input,VCF " + params.inVCF +
|
||||||
" -L 1:50,000,000-200,000,000" +
|
" -L 1:50,000,000-200,000,000" +
|
||||||
" -qual 50.0" +
|
" -qual 50.0" +
|
||||||
" --ignore_filter GATK_STANDARD" +
|
" --ignore_filter GATK_STANDARD" +
|
||||||
" -an QD -an HRun -an SB" +
|
" -an QD -an MQ -an SB" +
|
||||||
" -clusterFile %s",
|
" -clusterFile %s",
|
||||||
Arrays.asList(params.clusterMD5));
|
Arrays.asList(params.clusterMD5));
|
||||||
executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
|
executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
|
||||||
|
|
@ -72,13 +72,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
" -NO_HEADER" +
|
" -NO_HEADER" +
|
||||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||||
|
" -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||||
" -T VariantRecalibrator" +
|
" -T VariantRecalibrator" +
|
||||||
" -B:input,VCF " + params.inVCF +
|
" -B:input,VCF " + params.inVCF +
|
||||||
" -L 1:20,000,000-100,000,000" +
|
" -L 1:20,000,000-100,000,000" +
|
||||||
" --ignore_filter GATK_STANDARD" +
|
" --ignore_filter GATK_STANDARD" +
|
||||||
" --ignore_filter HARD_TO_VALIDATE" +
|
" --ignore_filter HARD_TO_VALIDATE" +
|
||||||
" -clusterFile " + getFileForMD5(params.clusterMD5) +
|
" -clusterFile " + getFileForMD5(params.clusterMD5) +
|
||||||
" -titv 2.07" +
|
" -sm TRUTH_SENSITIVITY" +
|
||||||
" -o %s" +
|
" -o %s" +
|
||||||
" -tranchesFile %s",
|
" -tranchesFile %s",
|
||||||
Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
|
Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue