Draft version of AnalyzeAnnotations which creates plots of cumulative TiTv ratio versus filter value per each annotation in the input VCF rod. Minor cleanup of recalibration walkers.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2623 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-18 20:47:10 +00:00
parent ff6877a15e
commit ba19afd529
7 changed files with 245 additions and 18 deletions

View File

@ -0,0 +1,29 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
annotationName = args[2]
c <- read.table(input, header=T)
#
# Plot residual error as a function of the covariate
#
gt = c[c$GT==1 & c$numVariants>1000,]
lt = c[c$GT==0 & c$numVariants>1000,]
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_GTfilter.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(gt$value,gt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"> x",sep=" "),pch=20);
dev.off()
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_LTfilter.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(lt$value,lt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"< x",sep=" "),pch=20);
dev.off()

View File

@ -94,16 +94,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
/////////////////////////////
// Private Member Variables
/////////////////////////////
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private final Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private final Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
// First add the required covariates
if( requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
@ -210,8 +209,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
logger.info( "\t" + cov.getClass().getSimpleName() );
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
}
dataManager = new RecalDataManager();
}

View File

@ -57,7 +57,7 @@ public class RecalDataManager {
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNoColorSpace = false;
public static final String versionString = "v2.2.12"; // Major version, minor version, and build number
public static final String versionString = "v2.2.13"; // Major version, minor version, and build number
RecalDataManager() {
data = new NestedHashMap();

View File

@ -91,12 +91,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// Private Member Variables
/////////////////////////////
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
private static final long RANDOM_SEED = 1032861495;
private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams
//---------------------------------------------------------------------------------------------------------------
//
@ -114,7 +114,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
logger.info( "Recalibrator version: " + RecalDataManager.versionString );
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
coinFlip = new Random(RANDOM_SEED);
if( !RAC.checkSolidRecalMode() ) {
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS");
}
@ -136,18 +135,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
}
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
// Read in the data from the csv file and populate the map
// Read in the data from the csv file and populate the data map and covariates list
logger.info( "Reading in the data from input csv file..." );
try {
for ( String line : new xReadLines(new File( RAC.RECAL_FILE )) ) {
lineNumber++;
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
; // Skip over the comment lines, (which start with '#')
}
// Read in the covariates that were used from the input file
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RAC.RECAL_FILE );

View File

@ -2,8 +2,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.IOException;
/*
* Copyright (c) 2010 The Broad Institute
@ -42,9 +49,17 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
// Command Line Arguments
/////////////////////////////
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
private String OUTPUT_DIR = "analyzeAnnotations/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
/////////////////////////////
// Private Member Variables
/////////////////////////////
private final AnnotationDataManager dataManager = new AnnotationDataManager();
//---------------------------------------------------------------------------------------------------------------
//
@ -53,6 +68,15 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
// create the output directory where all the data tables and plots will go
try {
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
} catch (IOException e) {
throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR);
}
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
}
//---------------------------------------------------------------------------------------------------------------
@ -62,7 +86,19 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
//---------------------------------------------------------------------------------------------------------------
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// Add each VCF record to eacn annotation's data structure
// Add each VCF record to each annotation's data structure
if( tracker != null ) {
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod instanceof RodVCF ) {
RodVCF variant = (RodVCF) rod;
if( variant.isSNP() ) {
dataManager.addAnnotations( variant );
}
}
}
}
return 1; // This value isn't actually used for anything
}
@ -81,6 +117,8 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
}
public void onTraversalDone( Integer sum ) {
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_DIR);
}
}
}

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.Map;
import java.util.Iterator;
import java.io.PrintStream;
import java.io.FileNotFoundException;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 18, 2010
*/
public class AnnotationDataManager {
public final HashMap<String, TreeSet<AnnotationDatum>> data;
public AnnotationDataManager() {
data = new HashMap<String, TreeSet<AnnotationDatum>>();
}
public void addAnnotations( RodVCF variant ) {
// Loop over each annotation in the vcf record
final Map<String,String> infoField = variant.getInfoValues();
for( String annotationKey : infoField.keySet() ) {
final float value = Float.parseFloat( infoField.get( annotationKey ) );
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() );
data.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
} else {
datum.incrementTv();
}
}
}
public void plotCumulativeTables(final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR) {
for( String annotationKey: data.keySet() ) {
PrintStream output;
try {
output = new PrintStream(OUTPUT_DIR + annotationKey + ".dat");
} catch (FileNotFoundException e) {
throw new StingException("Can't create intermediate output annotation data file.");
}
// Output a header line
output.println("value\tcumulativeTiTv\tnumVariants\tGT");
int numTi = 0;
int numTv = 0;
for( AnnotationDatum datum : data.get( annotationKey )) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t1");
}
numTi = 0;
numTv = 0;
Iterator<AnnotationDatum> iter = data.get( annotationKey ).descendingIterator();
while( iter.hasNext() ) {
final AnnotationDatum datum = iter.next();
numTi += datum.numTransitions;
numTv += datum.numTransversions;
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t0");
}
System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
try {
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
} catch (Exception e) {
throw new StingException("Unable to run RScript command");
}
}
}
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.utils.StingException;
import java.util.Comparator;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 18, 2010
*/
public class AnnotationDatum implements Comparator<AnnotationDatum> {
public final float value;
public int numTransitions;
public int numTransversions;
public AnnotationDatum() {
value = 0.0f;
numTransitions = 0;
numTransversions = 0;
}
public AnnotationDatum( float _value ) {
value = _value;
numTransitions = 0;
numTransversions = 0;
}
final public void incrementTi() {
numTransitions++;
}
final public void incrementTv() {
numTransversions++;
}
final public float calcTiTv() {
if( numTransitions < 0 || numTransversions < 0) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( numTransversions == 0) {
return 0.0f;
}
return ((float) numTransitions) / ((float) numTransversions);
}
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) {
if( a1.value < a2.value ) { return -1; }
if( a1.value > a2.value ) { return 1; }
return 0;
}
public int equals( AnnotationDatum that ) {
if( this.value < that.value ) { return -1; }
if( this.value > that.value ) { return 1; }
return 0;
}
}