Misc cleanup in VQSR.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5972 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-06-09 18:37:37 +00:00
parent e87c40d89c
commit 17e17d3c3c
5 changed files with 25 additions and 14 deletions

View File

@@ -123,7 +123,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
// setup the header fields // setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
final TreeSet<String> samples = new TreeSet<String>(); final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)); samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));

View File

@@ -93,7 +93,7 @@ public class MultivariateGaussian {
try { try {
cachedSigmaInverse = sigma.inverse(); cachedSigmaInverse = sigma.inverse();
} catch( Exception e ) { } catch( Exception e ) {
throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling."); throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example).");
} }
} }

View File

@@ -144,7 +144,7 @@ public class VariantDataManager {
int numAdded = 0; int numAdded = 0;
while( numAdded < numToAdd ) { while( numAdded < numToAdd ) {
final VariantDatum datum = data.get(index++); final VariantDatum datum = data.get(index++);
if( !datum.failingSTDThreshold ) { if( !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
trainingData.add( datum ); trainingData.add( datum );
datum.usedForTraining = -1; datum.usedForTraining = -1;
numAdded++; numAdded++;
@@ -209,15 +209,16 @@ public class VariantDataManager {
double value; double value;
try { try {
if( annotationKey.equals("QUAL") ) { if( annotationKey.equalsIgnoreCase("QUAL") ) {
value = vc.getPhredScaledQual(); value = vc.getPhredScaledQual();
} else { } else {
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
if( Double.isInfinite(value) ) { value = Double.NaN; } if( Double.isInfinite(value) ) { value = Double.NaN; }
if( jitter && ( annotationKey.equalsIgnoreCase("HRUN") ) ) { // Integer valued annotations must be jittered a bit to work in this GMM if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.01 ) { value = Double.NaN; }
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
} }
if( annotationKey.equals("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
} }
} catch( final Exception e ) { } catch( final Exception e ) {
@@ -226,7 +227,6 @@ public class VariantDataManager {
logger.warn("WARNING: Missing value detected for " + annotationKey + ". The VQSR will work with missing data by marginalizing over this dimension for this variant. This warning message is only shown once so there may be other annotations missing as well."); logger.warn("WARNING: Missing value detected for " + annotationKey + ". The VQSR will work with missing data by marginalizing over this dimension for this variant. This warning message is only shown once so there may be other annotations missing as well.");
warnedUserMissingValue = true; warnedUserMissingValue = true;
} }
} }
return value; return value;

View File

@@ -97,6 +97,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Hidden @Hidden
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false) @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
protected Boolean TRUST_ALL_POLYMORPHIC = false; protected Boolean TRUST_ALL_POLYMORPHIC = false;
@Hidden
@Argument(fullName = "projectConsensus", shortName = "projectConsensus", doc = "Perform 1000G project consensus. This implies an extra prior factor based on the individual participant callsets passed in with consensus=true rod binding tags.", required = false)
protected Boolean PERFORM_PROJECT_CONSENSUS = false;
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
@@ -174,7 +177,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately // Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, ref, context, vc, datum, TRUST_ALL_POLYMORPHIC ); dataManager.parseTrainingSets( tracker, ref, context, vc, datum, TRUST_ALL_POLYMORPHIC );
double priorFactor = QualityUtils.qualToProb( datum.prior ); double priorFactor = QualityUtils.qualToProb( datum.prior );
if( datum.consensusCount != 0 ) { if( PERFORM_PROJECT_CONSENSUS ) {
final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount ); final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior)); priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
} }
@@ -252,7 +255,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Execute Rscript command to create the tranche plot // Execute Rscript command to create the tranche plot
// Print out the command line to make it clear to the user what is being executed and how one might modify it // Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV; final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
logger.info( rScriptTranchesCommandLine ); logger.info( "Executing: " + rScriptTranchesCommandLine );
// Execute the RScript command to plot the table of truth values // Execute the RScript command to plot the table of truth values
try { try {
@@ -331,12 +334,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])"); stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])");
stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])"); stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])");
stream.println("dummyData <- " + dataFrame + "[1,]");
stream.println("dummyData$x <- NaN");
stream.println("dummyData$y <- NaN");
stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")"); stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")");
stream.println("p <- ggplot(data=" + dataFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))"); stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("p2 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + ", aes(x=x, y=y, colour = retained, alpha=0.3, size=1.5)) + scale_colour_gradient(name=\"\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))"); stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p3 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + "["+dataFrame+"$training==0,], aes(x=x, y=y, colour = training, alpha=0.3, size=1.5)) + geom_point(data="+ dataFrame + "["+dataFrame+"$training!=0,], aes(x=x, y=y, colour = training, alpha=0.3, size=1.5)) + scale_colour_gradient2(high=\"green\", mid=\"lightgrey\", low=\"purple\",breaks=c(-1,0,1), labels=c(\"bad\", \"\", \"good\"))"); stream.println("p2 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))");
stream.println("p4 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + ", aes(x=x, y=y, colour = novelty, alpha=0.3, size=1.5)) + scale_colour_gradient(name=\"\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))"); stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p3 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p4 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))");
stream.println("arrange(p1, p2, p3, p4, ncol=2)"); stream.println("arrange(p1, p2, p3, p4, ncol=2)");
} }
} }

View File

@@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
"d33212a84368e821cbedecd4f59756d6", // tranches "d33212a84368e821cbedecd4f59756d6", // tranches
"a35cd067f378442eee8cd5edeea92be0", // recal file "a35cd067f378442eee8cd5edeea92be0", // recal file
"7259b7daefe57b11ae9e537e38569160"); // cut VCF "126d52843f4a57199ee97750ffc16a07"); // cut VCF
@DataProvider(name = "VRTest") @DataProvider(name = "VRTest")
public Object[][] createData1() { public Object[][] createData1() {