Misc cleanup in VQSR.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5972 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e87c40d89c
commit
17e17d3c3c
|
|
@ -123,7 +123,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class MultivariateGaussian {
|
|||
try {
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
} catch( Exception e ) {
|
||||
throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling.");
|
||||
throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example).");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ public class VariantDataManager {
|
|||
int numAdded = 0;
|
||||
while( numAdded < numToAdd ) {
|
||||
final VariantDatum datum = data.get(index++);
|
||||
if( !datum.failingSTDThreshold ) {
|
||||
if( !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
|
||||
trainingData.add( datum );
|
||||
datum.usedForTraining = -1;
|
||||
numAdded++;
|
||||
|
|
@ -209,15 +209,16 @@ public class VariantDataManager {
|
|||
double value;
|
||||
|
||||
try {
|
||||
if( annotationKey.equals("QUAL") ) {
|
||||
if( annotationKey.equalsIgnoreCase("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
||||
if( jitter && ( annotationKey.equalsIgnoreCase("HRUN") ) ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||
if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.01 ) { value = Double.NaN; }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
}
|
||||
if( annotationKey.equals("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
}
|
||||
|
||||
} catch( final Exception e ) {
|
||||
|
|
@ -226,7 +227,6 @@ public class VariantDataManager {
|
|||
logger.warn("WARNING: Missing value detected for " + annotationKey + ". The VQSR will work with missing data by marginalizing over this dimension for this variant. This warning message is only shown once so there may be other annotations missing as well.");
|
||||
warnedUserMissingValue = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return value;
|
||||
|
|
|
|||
|
|
@ -97,6 +97,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "projectConsensus", shortName = "projectConsensus", doc = "Perform 1000G project consensus. This implies an extra prior factor based on the individual participant callsets passed in with consensus=true rod binding tags.", required = false)
|
||||
protected Boolean PERFORM_PROJECT_CONSENSUS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -174,7 +177,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
|
||||
dataManager.parseTrainingSets( tracker, ref, context, vc, datum, TRUST_ALL_POLYMORPHIC );
|
||||
double priorFactor = QualityUtils.qualToProb( datum.prior );
|
||||
if( datum.consensusCount != 0 ) {
|
||||
if( PERFORM_PROJECT_CONSENSUS ) {
|
||||
final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
|
||||
priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
|
||||
}
|
||||
|
|
@ -252,7 +255,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
// Execute Rscript command to create the tranche plot
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
|
||||
logger.info( rScriptTranchesCommandLine );
|
||||
logger.info( "Executing: " + rScriptTranchesCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
|
|
@ -331,12 +334,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
stream.println(surfaceFrame + " <- data.frame(x=s[,1], y=s[,2], lod=s[,3])");
|
||||
stream.println(dataFrame + " <- data.frame(x=d[,1], y=d[,2], retained=d[,3], training=d[,4], novelty=d[,5])");
|
||||
stream.println("dummyData <- " + dataFrame + "[1,]");
|
||||
stream.println("dummyData$x <- NaN");
|
||||
stream.println("dummyData$y <- NaN");
|
||||
stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
|
||||
stream.println("p1 = p + opts(title=\"model PDF\") + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\")");
|
||||
stream.println("p <- ggplot(data=" + dataFrame + ", aes(x=x, y=y)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
|
||||
stream.println("p2 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + ", aes(x=x, y=y, colour = retained, alpha=0.3, size=1.5)) + scale_colour_gradient(name=\"\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))");
|
||||
stream.println("p3 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + "["+dataFrame+"$training==0,], aes(x=x, y=y, colour = training, alpha=0.3, size=1.5)) + geom_point(data="+ dataFrame + "["+dataFrame+"$training!=0,], aes(x=x, y=y, colour = training, alpha=0.3, size=1.5)) + scale_colour_gradient2(high=\"green\", mid=\"lightgrey\", low=\"purple\",breaks=c(-1,0,1), labels=c(\"bad\", \"\", \"good\"))");
|
||||
stream.println("p4 = p + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + geom_point(data="+ dataFrame + ", aes(x=x, y=y, colour = novelty, alpha=0.3, size=1.5)) + scale_colour_gradient(name=\"\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))");
|
||||
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
|
||||
stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)");
|
||||
stream.println("p2 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),labels=c(\"filtered\",\"retained\"))");
|
||||
stream.println("p <- qplot(x,y,data="+ dataFrame + "["+dataFrame+"$training != 0,], color=training, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
|
||||
stream.println("q <- geom_point(aes(x=x,y=y,color=training),data=dummyData, alpha=1.0, na.rm=TRUE)");
|
||||
stream.println("p3 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(high=\"green\", low=\"purple\",breaks=c(-1,1), labels=c(\"neg\", \"pos\"))");
|
||||
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=novelty, alpha=I(1/7)) + opts(panel.background = theme_rect(colour = NA), panel.grid.minor = theme_line(colour = NA), panel.grid.major = theme_line(colour = NA))");
|
||||
stream.println("q <- geom_point(aes(x=x,y=y,color=novelty),data=dummyData, alpha=1.0, na.rm=TRUE)");
|
||||
stream.println("p4 = p + q + labs(x=\""+ USE_ANNOTATIONS[iii] +"\", y=\""+ USE_ANNOTATIONS[jjj] +"\") + scale_colour_gradient(name=\"novelty\", high=\"blue\", low=\"red\",breaks=c(-1,1), labels=c(\"novel\",\"known\"))");
|
||||
stream.println("arrange(p1, p2, p3, p4, ncol=2)");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
|
||||
"d33212a84368e821cbedecd4f59756d6", // tranches
|
||||
"a35cd067f378442eee8cd5edeea92be0", // recal file
|
||||
"7259b7daefe57b11ae9e537e38569160"); // cut VCF
|
||||
"126d52843f4a57199ee97750ffc16a07"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue