Adding tranche plot generation back to VQSR

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5736 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-05-03 19:26:26 +00:00
parent e73720c2db
commit 5bade81c6d
2 changed files with 34 additions and 8 deletions

View File

@ -52,7 +52,7 @@ alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$FDRtranche/100))
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
numPrevGood = leftShift(numGood, 0)
@ -66,12 +66,12 @@ barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (100
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(5, 1.75, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
legend(5, length(data2$targetTruthSensitivity)-1.25, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
mtext("Ti/Tv",2,line=2.25,at=length(data2$FDRtranche)*1.2,las=1, cex=1)
mtext("FNR",2,line=0,at=length(data2$FDRtranche)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=data2$FDRtranche, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$FDRtranche)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
sensitivity = data2$truthSensitivity

View File

@ -43,6 +43,7 @@ import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
@ -68,11 +69,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
private PrintStream RECAL_FILE;
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true)
private PrintStream TRANCHES_FILE;
private File TRANCHES_FILE;
/////////////////////////////
// Additional Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
private double TARGET_TITV = 2.15;
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
@ -83,6 +86,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false)
private String RSCRIPT_FILE = null;
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false)
private double TS_FILTER_LEVEL = 99.0;
@ -97,6 +102,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Private Member Variables
/////////////////////////////
private VariantDataManager dataManager;
private PrintStream tranchesStream;
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
private final Set<String> inputNames = new HashSet<String>();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
@ -136,6 +142,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
if( inputNames.size() == 0 ) {
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
}
try {
tranchesStream = new PrintStream(TRANCHES_FILE);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
}
}
//---------------------------------------------------------------------------------------------------------------
@ -224,7 +236,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY );
final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth );
final List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric );
TRANCHES_FILE.print(Tranche.tranchesString( tranches ));
tranchesStream.print(Tranche.tranchesString( tranches ));
double lodCutoff = 0.0;
for( final Tranche tranche : tranches ) {
@ -240,6 +252,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
createVisualizationScript( randomData, goodModel, badModel, lodCutoff );
}
// Execute Rscript command to create the tranche plot
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
logger.info( rScriptTranchesCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Process p;
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
p.waitFor();
} catch ( Exception e ) {
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
}
}
private void createVisualizationScript( final ExpandingArrayList<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff ) {