Draft version of AnalyzeAnnotations which creates plots of cumulative TiTv ratio versus filter value per each annotation in the input VCF rod. Minor cleanup of recalibration walkers.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2623 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ff6877a15e
commit
ba19afd529
|
|
@ -0,0 +1,29 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
annotationName = args[2]
|
||||
|
||||
|
||||
c <- read.table(input, header=T)
|
||||
|
||||
#
|
||||
# Plot residual error as a function of the covariate
|
||||
#
|
||||
|
||||
gt = c[c$GT==1 & c$numVariants>1000,]
|
||||
lt = c[c$GT==0 & c$numVariants>1000,]
|
||||
|
||||
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_GTfilter.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(gt$value,gt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"> x",sep=" "),pch=20);
|
||||
dev.off()
|
||||
|
||||
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_LTfilter.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(lt$value,lt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"< x",sep=" "),pch=20);
|
||||
dev.off()
|
||||
|
|
@ -94,16 +94,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
|
||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private final Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private final Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||
|
|
@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
// First add the required covariates
|
||||
if( requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
|
||||
|
|
@ -210,8 +209,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
|
||||
dataManager = new RecalDataManager();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class RecalDataManager {
|
|||
private static boolean warnUserNullReadGroup = false;
|
||||
private static boolean warnUserNoColorSpace = false;
|
||||
|
||||
public static final String versionString = "v2.2.12"; // Major version, minor version, and build number
|
||||
public static final String versionString = "v2.2.13"; // Major version, minor version, and build number
|
||||
|
||||
RecalDataManager() {
|
||||
data = new NestedHashMap();
|
||||
|
|
|
|||
|
|
@ -91,12 +91,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -114,7 +114,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "Recalibrator version: " + RecalDataManager.versionString );
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
coinFlip = new Random(RANDOM_SEED);
|
||||
if( !RAC.checkSolidRecalMode() ) {
|
||||
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS");
|
||||
}
|
||||
|
|
@ -136,18 +135,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
|
||||
}
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
// Read in the data from the csv file and populate the map
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
logger.info( "Reading in the data from input csv file..." );
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( RAC.RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RAC.RECAL_FILE );
|
||||
|
|
|
|||
|
|
@ -2,8 +2,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
|||
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -42,9 +49,17 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
|
||||
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
|
||||
private String OUTPUT_DIR = "analyzeAnnotations/";
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private final AnnotationDataManager dataManager = new AnnotationDataManager();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -53,6 +68,15 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
|
||||
// create the output directory where all the data tables and plots will go
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR);
|
||||
}
|
||||
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -62,7 +86,19 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
// Add each VCF record to eacn annotation's data structure
|
||||
|
||||
// Add each VCF record to each annotation's data structure
|
||||
if( tracker != null ) {
|
||||
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
|
||||
if( rod != null && rod instanceof RodVCF ) {
|
||||
RodVCF variant = (RodVCF) rod;
|
||||
if( variant.isSNP() ) {
|
||||
dataManager.addAnnotations( variant );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1; // This value isn't actually used for anything
|
||||
}
|
||||
|
||||
|
|
@ -81,6 +117,8 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone( Integer sum ) {
|
||||
|
||||
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
|
||||
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_DIR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Map;
|
||||
import java.util.Iterator;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 18, 2010
|
||||
*/
|
||||
|
||||
public class AnnotationDataManager {
|
||||
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> data;
|
||||
|
||||
public AnnotationDataManager() {
|
||||
data = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
}
|
||||
|
||||
public void addAnnotations( RodVCF variant ) {
|
||||
|
||||
// Loop over each annotation in the vcf record
|
||||
final Map<String,String> infoField = variant.getInfoValues();
|
||||
for( String annotationKey : infoField.keySet() ) {
|
||||
final float value = Float.parseFloat( infoField.get( annotationKey ) );
|
||||
|
||||
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
|
||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() );
|
||||
data.put( annotationKey, treeSet );
|
||||
}
|
||||
AnnotationDatum datum = new AnnotationDatum( value );
|
||||
if( treeSet.contains(datum) ) {
|
||||
datum = treeSet.tailSet(datum).first();
|
||||
} else {
|
||||
treeSet.add(datum);
|
||||
}
|
||||
|
||||
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
||||
datum.incrementTi();
|
||||
} else {
|
||||
datum.incrementTv();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void plotCumulativeTables(final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR) {
|
||||
|
||||
for( String annotationKey: data.keySet() ) {
|
||||
PrintStream output;
|
||||
try {
|
||||
output = new PrintStream(OUTPUT_DIR + annotationKey + ".dat");
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Can't create intermediate output annotation data file.");
|
||||
}
|
||||
// Output a header line
|
||||
output.println("value\tcumulativeTiTv\tnumVariants\tGT");
|
||||
|
||||
int numTi = 0;
|
||||
int numTv = 0;
|
||||
for( AnnotationDatum datum : data.get( annotationKey )) {
|
||||
numTi += datum.numTransitions;
|
||||
numTv += datum.numTransversions;
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t1");
|
||||
|
||||
}
|
||||
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
Iterator<AnnotationDatum> iter = data.get( annotationKey ).descendingIterator();
|
||||
while( iter.hasNext() ) {
|
||||
final AnnotationDatum datum = iter.next();
|
||||
numTi += datum.numTransitions;
|
||||
numTv += datum.numTransversions;
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t0");
|
||||
|
||||
}
|
||||
|
||||
System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
||||
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
|
||||
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
||||
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
|
||||
} catch (Exception e) {
|
||||
throw new StingException("Unable to run RScript command");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 18, 2010
|
||||
*/
|
||||
|
||||
public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
||||
public final float value;
|
||||
public int numTransitions;
|
||||
public int numTransversions;
|
||||
|
||||
public AnnotationDatum() {
|
||||
value = 0.0f;
|
||||
numTransitions = 0;
|
||||
numTransversions = 0;
|
||||
}
|
||||
|
||||
public AnnotationDatum( float _value ) {
|
||||
value = _value;
|
||||
numTransitions = 0;
|
||||
numTransversions = 0;
|
||||
}
|
||||
|
||||
final public void incrementTi() {
|
||||
numTransitions++;
|
||||
}
|
||||
|
||||
final public void incrementTv() {
|
||||
numTransversions++;
|
||||
}
|
||||
|
||||
final public float calcTiTv() {
|
||||
|
||||
if( numTransitions < 0 || numTransversions < 0) {
|
||||
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
|
||||
}
|
||||
|
||||
if( numTransversions == 0) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return ((float) numTransitions) / ((float) numTransversions);
|
||||
}
|
||||
|
||||
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) {
|
||||
if( a1.value < a2.value ) { return -1; }
|
||||
if( a1.value > a2.value ) { return 1; }
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int equals( AnnotationDatum that ) {
|
||||
if( this.value < that.value ) { return -1; }
|
||||
if( this.value > that.value ) { return 1; }
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue