Memory optimization for the VariantRecalibrator. Only add variants to the list if they pass the novelty and qual filters.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4051 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-17 21:57:28 +00:00
parent b7c60b9729
commit 8f15b2ba72
3 changed files with 40 additions and 73 deletions

View File

@ -153,7 +153,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final double annotationValues[] = new double[annotationKeys.size()]; final double annotationValues[] = new double[annotationKeys.size()];
// todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) { for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) { if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) { if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
@ -183,8 +182,10 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; } else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; } else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
} }
mapList.add( variantDatum ); if( variantDatum.weight > 0.0 && variantDatum.qual > QUAL_THRESHOLD ) {
mapList.add( variantDatum );
}
} }
} }
} }

View File

@ -188,77 +188,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Initialize the Allele Count prior // Initialize the Allele Count prior
generateAlleleCountPrior(); generateAlleleCountPrior();
int numValid = 0; // int numValid = 0;
int numOutlier = 0; // int numOutlier = 0;
int numBadQual = 0; // int numBadQual = 0;
int numZeroWeight = 0; // int numZeroWeight = 0;
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value or having low quality score // Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value
for( final VariantDatum datum : dataManager.data ) { // Filtering based on known status and qual threshold happens in GenerateVariantClusters
boolean goodVar = true; for( int iii = 0; iii < dataManager.data.length; iii++ ) {
if(!(datum.weight > 0.0)) { final VariantDatum datum = dataManager.data[iii];
goodVar = false; for( final double val : datum.annotations ) {
numZeroWeight++; if( Math.abs(val) > stdThreshold ) {
} datum.weight = 0.0;
if(goodVar) { break;
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
numOutlier++;
break;
}
} }
} }
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
numBadQual++;
}
}
if(goodVar) { numValid++; }
} }
final VariantDatum data[] = new VariantDatum[numValid]; // logger.info("Clustering with " + data.length + " valid variants.");
int iii = 0; // logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
for( final VariantDatum datum : dataManager.data ) { // logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
boolean goodVar = true; // logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
if(!(datum.weight > 0.0)) {
goodVar = false;
}
if(goodVar) {
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
break;
}
}
}
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
}
}
if(goodVar) { data[iii++] = datum; }
}
logger.info("Clustering with " + data.length + " valid variants."); generateEmpricalStats( dataManager.data );
logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
generateEmpricalStats( data );
logger.info("Initializing using k-means..."); logger.info("Initializing using k-means...");
initializeUsingKMeans( data ); initializeUsingKMeans( dataManager.data );
logger.info("... done!"); logger.info("... done!");
createClusters( data, 0, maxGaussians, clusterFileName ); createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
// Simply cluster with all the variants. The knowns have been given more weight than the novels // Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants."); //logger.info("Clustering with " + dataManager.data.length + " variants.");
//createClusters( dataManager.data, 0, numGaussians, clusterFileName ); //createClusters( dataManager.data, 0, numGaussians, clusterFileName );
} }
private void generateEmpricalStats( VariantDatum[] data ) { private void generateEmpricalStats( final VariantDatum[] data ) {
final int numVariants = data.length; final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length; final int numAnnotations = data[0].annotations.length;
@ -323,7 +287,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
public void setSingletonFPRate(double rate) { public void setSingletonFPRate( final double rate ) {
this.singletonFPRate = rate; this.singletonFPRate = rate;
} }
@ -358,19 +322,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int[] assignment = new int[numVariants]; final int[] assignment = new int[numVariants];
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
final VariantDatum datum = data[iii]; final VariantDatum datum = data[iii];
double minDistance = Double.MAX_VALUE; if(datum.weight > 0.0) {
int minCluster = -1; double minDistance = Double.MAX_VALUE;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) { int minCluster = -1;
double dist = 0.0; for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { double dist = 0.0;
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]); for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
} dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
if(dist < minDistance) { }
minDistance = dist; if(dist < minDistance) {
minCluster = kkk; minDistance = dist;
minCluster = kkk;
}
} }
assignment[iii] = minCluster;
} }
assignment[iii] = minCluster;
} }
for( int kkk = 0; kkk < maxGaussians; kkk++ ) { for( int kkk = 0; kkk < maxGaussians; kkk++ ) {

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test @Test
public void testGenerateVariantClusters() { public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" ); e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey(); String vcf = entry.getKey();
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test @Test
public void testVariantRecalibrator() { public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "3c5a41567b60b777875d585a379eb231" ); e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey(); String vcf = entry.getKey();