From ba19afd52917bafeb370b1a65349bd95a7462d2e Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 18 Jan 2010 20:47:10 +0000 Subject: [PATCH] Draft version of AnalyzeAnnotations which creates plots of cumulative TiTv ratio versus filter value per each annotation in the input VCF rod. Minor cleanup of recalibration walkers. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2623 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_Annotations_CumulativeTiTv.R | 29 +++++ .../recalibration/CovariateCounterWalker.java | 11 +- .../recalibration/RecalDataManager.java | 2 +- .../TableRecalibrationWalker.java | 13 +-- .../AnalyzeAnnotationsWalker.java | 42 ++++++- .../AnnotationDataManager.java | 104 ++++++++++++++++++ .../variantoptimizer/AnnotationDatum.java | 62 +++++++++++ 7 files changed, 245 insertions(+), 18 deletions(-) create mode 100644 R/plot_Annotations_CumulativeTiTv.R create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java diff --git a/R/plot_Annotations_CumulativeTiTv.R b/R/plot_Annotations_CumulativeTiTv.R new file mode 100644 index 000000000..6b981b2e1 --- /dev/null +++ b/R/plot_Annotations_CumulativeTiTv.R @@ -0,0 +1,29 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] + + +c <- read.table(input, header=T) + +# +# Plot residual error as a function of the covariate +# + +gt = c[c$GT==1 & c$numVariants>1000,] +lt = c[c$GT==0 & c$numVariants>1000,] + +outfile = paste(input, ".cumulativeTiTv_", annotationName, "_GTfilter.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(gt$value,gt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"> x",sep=" "),pch=20); +dev.off() + +outfile = paste(input, ".cumulativeTiTv_", annotationName, "_LTfilter.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(lt$value,lt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"< x",sep=" "),pch=20); +dev.off() \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 96bb096e4..24445fef6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -94,16 +94,16 @@ public class CovariateCounterWalker extends LocusWalker { ///////////////////////////// // Private Member Variables ///////////////////////////// - private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested + private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // A list to hold the covariate objects that were requested private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci - private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci + private final Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci + private final Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation @@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker { } // Initialize the requested covariates by parsing the -cov argument - requestedCovariates = new ArrayList(); // First add the required covariates if( requiredClasses.size() == 2) { // readGroup and reported quality score requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here @@ -210,8 +209,6 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "\t" + cov.getClass().getSimpleName() ); cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection } - - dataManager = new RecalDataManager(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 4e9a40df4..f3904810a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -57,7 +57,7 @@ public class RecalDataManager { private static boolean warnUserNullReadGroup = false; private static boolean warnUserNoColorSpace = false; - public static final String versionString = "v2.2.12"; // Major version, minor version, and build number + public static final String versionString = "v2.2.13"; // Major version, minor version, and build number RecalDataManager() { data = new NestedHashMap(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 51e66d0af..d72d05c2b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -91,12 +91,12 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; // List of covariates to be used in this calculation + private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - private Random coinFlip; // Random number generator is used to remove reference bias in solid bams private static final long RANDOM_SEED = 1032861495; + private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams //--------------------------------------------------------------------------------------------------------------- // @@ -114,7 +114,6 @@ public class TableRecalibrationWalker extends ReadWalker(); - - // Read in the data from the csv file and populate the map + // Read in the data from the csv file and populate the data map and covariates list logger.info( "Reading in the data from input csv file..." ); try { for ( String line : new xReadLines(new File( RAC.RECAL_FILE )) ) { lineNumber++; - if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) { + if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { ; // Skip over the comment lines, (which start with '#') } + // Read in the covariates that were used from the input file else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data if( foundAllCovariates ) { throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RAC.RECAL_FILE ); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java index d80a9df8f..c1389efdb 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java @@ -2,8 +2,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODRecordList; +import org.broadinstitute.sting.gatk.refdata.RodVCF; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.io.IOException; /* * Copyright (c) 2010 The Broad Institute @@ -42,9 +49,17 @@ public class AnalyzeAnnotationsWalker extends RodWalker { // Command Line Arguments ///////////////////////////// + @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) + private String OUTPUT_DIR = "analyzeAnnotations/"; + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) + private String PATH_TO_RESOURCES = "R/"; + ///////////////////////////// // Private Member Variables ///////////////////////////// + private final AnnotationDataManager dataManager = new AnnotationDataManager(); //--------------------------------------------------------------------------------------------------------------- // @@ -53,6 +68,15 @@ public class AnalyzeAnnotationsWalker extends RodWalker { //--------------------------------------------------------------------------------------------------------------- public void initialize() { + + // create the output directory where all the data tables and plots will go + try { + Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR); + } catch (IOException e) { + throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR); + } + if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; } + if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } } //--------------------------------------------------------------------------------------------------------------- @@ -62,7 +86,19 @@ public class AnalyzeAnnotationsWalker extends RodWalker { //--------------------------------------------------------------------------------------------------------------- public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - // Add each VCF record to eacn annotation's data structure + + // Add each VCF record to each annotation's data structure + if( tracker != null ) { + for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { + if( rod != null && rod instanceof RodVCF ) { + RodVCF variant = (RodVCF) rod; + if( variant.isSNP() ) { + dataManager.addAnnotations( variant ); + } + } + } + } + return 1; // This value isn't actually used for anything } @@ -81,6 +117,8 @@ public class AnalyzeAnnotationsWalker extends RodWalker { } public void onTraversalDone( Integer sum ) { + // For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables + dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_DIR); } -} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java new file mode 100755 index 000000000..d779373fd --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; + +import java.util.HashMap; +import java.util.TreeSet; +import java.util.Map; +import java.util.Iterator; +import java.io.PrintStream; +import java.io.FileNotFoundException; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Jan 18, 2010 + */ + +public class AnnotationDataManager { + + public final HashMap> data; + + public AnnotationDataManager() { + data = new HashMap>(); + } + + public void addAnnotations( RodVCF variant ) { + + // Loop over each annotation in the vcf record + final Map infoField = variant.getInfoValues(); + for( String annotationKey : infoField.keySet() ) { + final float value = Float.parseFloat( infoField.get( annotationKey ) ); + + TreeSet treeSet = data.get( annotationKey ); + if( treeSet == null ) { // This annotation hasn't been seen before + treeSet = new TreeSet( new AnnotationDatum() ); + data.put( annotationKey, treeSet ); + } + AnnotationDatum datum = new AnnotationDatum( value ); + if( treeSet.contains(datum) ) { + datum = treeSet.tailSet(datum).first(); + } else { + treeSet.add(datum); + } + + if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { + datum.incrementTi(); + } else { + datum.incrementTv(); + } + } + } + + public void plotCumulativeTables(final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR) { + + for( String annotationKey: data.keySet() ) { + PrintStream output; + try { + output = new PrintStream(OUTPUT_DIR + annotationKey + ".dat"); + } catch (FileNotFoundException e) { + throw new StingException("Can't create intermediate output annotation data file."); + } + // Output a header line + output.println("value\tcumulativeTiTv\tnumVariants\tGT"); + + int numTi = 0; + int numTv = 0; + for( AnnotationDatum datum : data.get( annotationKey )) { + numTi += datum.numTransitions; + numTv += datum.numTransversions; + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t1"); + + } + + numTi = 0; + numTv = 0; + Iterator iter = data.get( annotationKey ).descendingIterator(); + while( iter.hasNext() ) { + final AnnotationDatum datum = iter.next(); + numTi += datum.numTransitions; + numTv += datum.numTransversions; + float titv; + if( numTv == 0) { titv = 0.0f; } + else { titv = ((float) numTi) / ((float) numTv); } + output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t0"); + + } + + System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " + + OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey); + + try { + Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " + + OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey); + } catch (Exception e) { + throw new StingException("Unable to run RScript command"); + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java new file mode 100755 index 000000000..32f3b4021 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDatum.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.utils.StingException; + +import java.util.Comparator; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Jan 18, 2010 + */ + +public class AnnotationDatum implements Comparator { + public final float value; + public int numTransitions; + public int numTransversions; + + public AnnotationDatum() { + value = 0.0f; + numTransitions = 0; + numTransversions = 0; + } + + public AnnotationDatum( float _value ) { + value = _value; + numTransitions = 0; + numTransversions = 0; + } + + final public void incrementTi() { + numTransitions++; + } + + final public void incrementTv() { + numTransversions++; + } + + final public float calcTiTv() { + + if( numTransitions < 0 || numTransversions < 0) { + throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." ); + } + + if( numTransversions == 0) { + return 0.0f; + } + + return ((float) numTransitions) / ((float) numTransversions); + } + + public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { + if( a1.value < a2.value ) { return -1; } + if( a1.value > a2.value ) { return 1; } + return 0; + } + + public int equals( AnnotationDatum that ) { + if( this.value < that.value ) { return -1; } + if( this.value > that.value ) { return 1; } + return 0; + } +}