Variant Optimizer accepts a dbSNP rod arugment to use in determining known/novel status as opposed to using the rsID in the vcf record. VO generates plots of annotation values used in clustering broken out by knowns and novels. Useful for showing which annotations are approximately Gaussian.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3332 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-05-09 16:48:07 +00:00
parent 76efa757f0
commit 33a9549896
5 changed files with 156 additions and 14 deletions

View File

@ -0,0 +1,19 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
annotationName = args[2]
data = read.table(input,sep=",",head=T)
outfile = paste(input, ".ClusterReport.pdf", sep="")
pdf(outfile, height=7, width=8)
maxP = max(data$knownDist, data$novelDist)
plot(data$annotationValue, data$knownDist, ylim=c(0,maxP),type="b",col="orange",lwd=2,xlab=annotationName,ylab="fraction of SNPs")
points(data$annotationValue, data$novelDist, type="b",col="blue",lwd=2)
legend('topright', c('knowns','novels'),lwd=2,col=c("orange","blue"))
dev.off()

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrde
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
@ -89,6 +90,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
private VCFWriter vcfWriter;
private Set<String> ignoreInputFilterSet = null;
private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
private boolean usingDBSNP = false;
//---------------------------------------------------------------------------------------------------------------
@ -127,18 +129,25 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
final TreeSet<String> samples = new TreeSet<String>();
List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
RMDTrack rod = source.getReferenceOrderedData();
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for ( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getType().equals(VCFCodec.class) ) {
VCFReader reader = new VCFReader(rod.getFile());
Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
reader.close();
}
}
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
usingDBSNP = true;
}
}
}
//---------------------------------------------------------------------------------------------------------------
@ -155,13 +164,22 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
}
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
if( vc != null && vc.isSNP() ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
final VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
boolean isKnown = !vc.getAttribute("ID").equals(".");
if(usingDBSNP) {
isKnown = false;
for( VariantContext dbsnpVC : tracker.getVariantContexts(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
if(dbsnpVC != null && dbsnpVC.isSNP()) {
isKnown=true;
}
}
}
variantDatum.isKnown = isKnown;
final double pTrue = theModel.evaluateVariant( vc.getAttributes(), vc.getPhredScaledQual() );
double recalQual = 400.0 * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) );

View File

@ -107,6 +107,8 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
trueNegGlobal[kkk] = 0;
falseNegGlobal[kkk] = 0;
}
}
//---------------------------------------------------------------------------------------------------------------

View File

@ -506,6 +506,61 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
*/
}
public final void outputClusterReports( final String outputPrefix ) {
final double STD_STEP = 0.2;
final double MAX_STD = 4.0;
final double MIN_STD = -4.0;
final int NUM_BINS = (int)Math.floor((Math.abs(MIN_STD) + Math.abs(MAX_STD)) / STD_STEP);
final int numAnnotations = dataManager.numAnnotations;
int totalCountsKnown = 0;
int totalCountsNovel = 0;
final int counts[][][] = new int[numAnnotations][NUM_BINS][2];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int iii = 0; iii < NUM_BINS; iii++ ) {
counts[jjj][iii][0] = 0;
counts[jjj][iii][1] = 0;
}
}
for( VariantDatum datum : dataManager.data ) {
final int isKnown = ( datum.isKnown ? 1 : 0 );
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
if(histBin < 0) { histBin = 0; }
if(histBin > NUM_BINS-1) { histBin = NUM_BINS-1; }
if(histBin >= 0 && histBin <= NUM_BINS-1) {
counts[jjj][histBin][isKnown]++;
}
}
if( isKnown == 1 ) { totalCountsKnown++; }
else { totalCountsNovel++; }
}
int annIndex = 0;
for( final String annotation : dataManager.annotationKeys ) {
PrintStream outputFile;
try {
outputFile = new PrintStream( outputPrefix + "." + annotation + ".dat" );
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
}
outputFile.println("annotationValue,knownDist,novelDist");
for( int iii = 0; iii < NUM_BINS; iii++ ) {
final double annotationValue = (((double)iii * STD_STEP)+MIN_STD) * dataManager.varianceVector[annIndex] + dataManager.meanVector[annIndex];
outputFile.println( annotationValue + "," + ( ((double)counts[annIndex][iii][1])/((double)totalCountsKnown) ) +
"," + ( ((double)counts[annIndex][iii][0])/((double)totalCountsNovel) ));
}
annIndex++;
}
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
final int numVariants = data.length;
@ -719,7 +774,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
pVarInCluster[kkk] = (1.0 / ((double) numGaussians)) * (Math.exp( -0.5 * sum )) / denom;
pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
/*
if( isUsingTiTvModel ) {

View File

@ -28,14 +28,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
@ -66,11 +71,16 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
private int NUM_GAUSSIANS = 26;
private int NUM_GAUSSIANS = 7;
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
private int NUM_ITERATIONS = 10;
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
private int MIN_VAR_IN_CLUSTER = 1000;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
@ -85,6 +95,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
private int numAnnotations = 0;
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
private Set<String> ignoreInputFilterSet = null;
private boolean usingDBSNP = false;
//---------------------------------------------------------------------------------------------------------------
//
@ -96,6 +107,13 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
if( IGNORE_INPUT_FILTERS != null ) {
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
}
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
usingDBSNP = true;
}
}
}
//---------------------------------------------------------------------------------------------------------------
@ -116,7 +134,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() ) {
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
annotationKeys.addAll( vc.getAttributes().keySet() );
@ -162,7 +180,16 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
final VariantDatum variantDatum = new VariantDatum();
variantDatum.annotations = annotationValues;
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
boolean isKnown = !vc.getAttribute("ID").equals(".");
if(usingDBSNP) {
isKnown = false;
for( VariantContext dbsnpVC : tracker.getVariantContexts(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
if(dbsnpVC != null && dbsnpVC.isSNP()) {
isKnown=true;
}
}
}
variantDatum.isKnown = isKnown;
mapList.add( variantDatum );
}
@ -198,7 +225,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
// Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
VariantOptimizationModel theModel;
VariantGaussianMixtureModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
@ -211,6 +238,27 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
}
theModel.run( CLUSTER_FILENAME );
theModel.outputClusterReports( CLUSTER_FILENAME );
for( final String annotation : annotationKeys ) {
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {
final Process p = Runtime.getRuntime().exec( rScriptCommandLine );
p.waitFor();
} catch (InterruptedException e) {
e.printStackTrace();
System.exit(-1);
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}
}
}
}