Memory optimization for the VariantRecalibrator. Only add variants to the list if they pass the novelty and qual filters.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4051 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b7c60b9729
commit
8f15b2ba72
|
|
@ -153,7 +153,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
|
||||
final double annotationValues[] = new double[annotationKeys.size()];
|
||||
|
||||
// todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
|
|
@ -183,8 +182,10 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
|
||||
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
|
||||
}
|
||||
|
||||
mapList.add( variantDatum );
|
||||
|
||||
if( variantDatum.weight > 0.0 && variantDatum.qual > QUAL_THRESHOLD ) {
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -188,77 +188,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
// Initialize the Allele Count prior
|
||||
generateAlleleCountPrior();
|
||||
|
||||
int numValid = 0;
|
||||
int numOutlier = 0;
|
||||
int numBadQual = 0;
|
||||
int numZeroWeight = 0;
|
||||
// int numValid = 0;
|
||||
// int numOutlier = 0;
|
||||
// int numBadQual = 0;
|
||||
// int numZeroWeight = 0;
|
||||
|
||||
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value or having low quality score
|
||||
for( final VariantDatum datum : dataManager.data ) {
|
||||
boolean goodVar = true;
|
||||
if(!(datum.weight > 0.0)) {
|
||||
goodVar = false;
|
||||
numZeroWeight++;
|
||||
}
|
||||
if(goodVar) {
|
||||
for( final double val : datum.annotations ) {
|
||||
if( Math.abs(val) > stdThreshold ) {
|
||||
goodVar = false;
|
||||
numOutlier++;
|
||||
break;
|
||||
}
|
||||
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value
|
||||
// Filtering based on known status and qual threshold happens in GenerateVariantClusters
|
||||
for( int iii = 0; iii < dataManager.data.length; iii++ ) {
|
||||
final VariantDatum datum = dataManager.data[iii];
|
||||
for( final double val : datum.annotations ) {
|
||||
if( Math.abs(val) > stdThreshold ) {
|
||||
datum.weight = 0.0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(goodVar) {
|
||||
if( datum.qual < qualThreshold ) {
|
||||
goodVar = false;
|
||||
numBadQual++;
|
||||
}
|
||||
}
|
||||
if(goodVar) { numValid++; }
|
||||
}
|
||||
|
||||
final VariantDatum data[] = new VariantDatum[numValid];
|
||||
int iii = 0;
|
||||
for( final VariantDatum datum : dataManager.data ) {
|
||||
boolean goodVar = true;
|
||||
if(!(datum.weight > 0.0)) {
|
||||
goodVar = false;
|
||||
}
|
||||
if(goodVar) {
|
||||
for( final double val : datum.annotations ) {
|
||||
if( Math.abs(val) > stdThreshold ) {
|
||||
goodVar = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(goodVar) {
|
||||
if( datum.qual < qualThreshold ) {
|
||||
goodVar = false;
|
||||
}
|
||||
}
|
||||
if(goodVar) { data[iii++] = datum; }
|
||||
}
|
||||
// logger.info("Clustering with " + data.length + " valid variants.");
|
||||
// logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
|
||||
// logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
|
||||
// logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
|
||||
|
||||
logger.info("Clustering with " + data.length + " valid variants.");
|
||||
logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
|
||||
logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
|
||||
logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
|
||||
|
||||
generateEmpricalStats( data );
|
||||
generateEmpricalStats( dataManager.data );
|
||||
|
||||
logger.info("Initializing using k-means...");
|
||||
initializeUsingKMeans( data );
|
||||
initializeUsingKMeans( dataManager.data );
|
||||
logger.info("... done!");
|
||||
createClusters( data, 0, maxGaussians, clusterFileName );
|
||||
createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
|
||||
|
||||
// Simply cluster with all the variants. The knowns have been given more weight than the novels
|
||||
//logger.info("Clustering with " + dataManager.data.length + " variants.");
|
||||
//createClusters( dataManager.data, 0, numGaussians, clusterFileName );
|
||||
}
|
||||
|
||||
private void generateEmpricalStats( VariantDatum[] data ) {
|
||||
private void generateEmpricalStats( final VariantDatum[] data ) {
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
|
|
@ -323,7 +287,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
|
||||
public void setSingletonFPRate(double rate) {
|
||||
public void setSingletonFPRate( final double rate ) {
|
||||
this.singletonFPRate = rate;
|
||||
}
|
||||
|
||||
|
|
@ -358,19 +322,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
final int[] assignment = new int[numVariants];
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final VariantDatum datum = data[iii];
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
int minCluster = -1;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
double dist = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
|
||||
}
|
||||
if(dist < minDistance) {
|
||||
minDistance = dist;
|
||||
minCluster = kkk;
|
||||
if(datum.weight > 0.0) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
int minCluster = -1;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
double dist = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
|
||||
}
|
||||
if(dist < minDistance) {
|
||||
minDistance = dist;
|
||||
minCluster = kkk;
|
||||
}
|
||||
}
|
||||
assignment[iii] = minCluster;
|
||||
}
|
||||
assignment[iii] = minCluster;
|
||||
}
|
||||
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testGenerateVariantClusters() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testVariantRecalibrator() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "3c5a41567b60b777875d585a379eb231" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue