Memory optimization for the VariantRecalibrator. Only add variants to the list if they pass the novelty and qual filters.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4051 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-17 21:57:28 +00:00
parent b7c60b9729
commit 8f15b2ba72
3 changed files with 40 additions and 73 deletions

View File

@ -153,7 +153,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final double annotationValues[] = new double[annotationKeys.size()];
// todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
@ -183,8 +182,10 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
}
mapList.add( variantDatum );
if( variantDatum.weight > 0.0 && variantDatum.qual > QUAL_THRESHOLD ) {
mapList.add( variantDatum );
}
}
}
}

View File

@ -188,77 +188,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Initialize the Allele Count prior
generateAlleleCountPrior();
int numValid = 0;
int numOutlier = 0;
int numBadQual = 0;
int numZeroWeight = 0;
// int numValid = 0;
// int numOutlier = 0;
// int numBadQual = 0;
// int numZeroWeight = 0;
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value or having low quality score
for( final VariantDatum datum : dataManager.data ) {
boolean goodVar = true;
if(!(datum.weight > 0.0)) {
goodVar = false;
numZeroWeight++;
}
if(goodVar) {
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
numOutlier++;
break;
}
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value
// Filtering based on known status and qual threshold happens in GenerateVariantClusters
for( int iii = 0; iii < dataManager.data.length; iii++ ) {
final VariantDatum datum = dataManager.data[iii];
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
datum.weight = 0.0;
break;
}
}
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
numBadQual++;
}
}
if(goodVar) { numValid++; }
}
final VariantDatum data[] = new VariantDatum[numValid];
int iii = 0;
for( final VariantDatum datum : dataManager.data ) {
boolean goodVar = true;
if(!(datum.weight > 0.0)) {
goodVar = false;
}
if(goodVar) {
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
break;
}
}
}
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
}
}
if(goodVar) { data[iii++] = datum; }
}
// logger.info("Clustering with " + data.length + " valid variants.");
// logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
// logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
// logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
logger.info("Clustering with " + data.length + " valid variants.");
logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
generateEmpricalStats( data );
generateEmpricalStats( dataManager.data );
logger.info("Initializing using k-means...");
initializeUsingKMeans( data );
initializeUsingKMeans( dataManager.data );
logger.info("... done!");
createClusters( data, 0, maxGaussians, clusterFileName );
createClusters( dataManager.data, 0, maxGaussians, clusterFileName );
// Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants.");
//createClusters( dataManager.data, 0, numGaussians, clusterFileName );
}
private void generateEmpricalStats( VariantDatum[] data ) {
private void generateEmpricalStats( final VariantDatum[] data ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
@ -323,7 +287,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
public void setSingletonFPRate(double rate) {
public void setSingletonFPRate( final double rate ) {
this.singletonFPRate = rate;
}
@ -358,19 +322,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int[] assignment = new int[numVariants];
for( int iii = 0; iii < numVariants; iii++ ) {
final VariantDatum datum = data[iii];
double minDistance = Double.MAX_VALUE;
int minCluster = -1;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
double dist = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
}
if(dist < minDistance) {
minDistance = dist;
minCluster = kkk;
if(datum.weight > 0.0) {
double minDistance = Double.MAX_VALUE;
int minCluster = -1;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
double dist = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
}
if(dist < minDistance) {
minDistance = dist;
minCluster = kkk;
}
}
assignment[iii] = minCluster;
}
assignment[iii] = minCluster;
}
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "3c5a41567b60b777875d585a379eb231" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();