AnalyzeAnnotations now bins variants per each annotation and outputs plots of TiTv ratio as a function of the annotation's value.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2654 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4fc926232c
commit
d9df72e1b5
|
|
@ -0,0 +1,47 @@
|
||||||
|
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||||
|
|
||||||
|
args <- commandArgs(TRUE)
|
||||||
|
verbose = TRUE
|
||||||
|
|
||||||
|
input = args[1]
|
||||||
|
outputDir = args[2]
|
||||||
|
annotationName = args[3]
|
||||||
|
minBinCutoff = as.numeric(args[4])
|
||||||
|
|
||||||
|
|
||||||
|
c <- read.table(input, header=T)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot TiTv ratio as a function of the annotation
|
||||||
|
#
|
||||||
|
|
||||||
|
gt = c[c$numVariants>minBinCutoff,]
|
||||||
|
|
||||||
|
outfile = paste(outputDir, "binnedTiTv.", annotationName, ".pdf", sep="")
|
||||||
|
pdf(outfile, height=7, width=7)
|
||||||
|
par(cex=1.1)
|
||||||
|
plot(gt$value,gt$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20);
|
||||||
|
m = weighted.mean(gt$value,gt$numVariants/sum(gt$numVariants))
|
||||||
|
ma = gt[gt$value > m,]
|
||||||
|
mb = gt[gt$value < m,]
|
||||||
|
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
|
||||||
|
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
|
||||||
|
abline(v=m,lty=2)
|
||||||
|
abline(v=m75,lty=2)
|
||||||
|
abline(v=m25,lty=2)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
outfile = paste(outputDir, "binnedTiTv_quartiles.", annotationName, ".pdf", sep="")
|
||||||
|
pdf(outfile, height=7, width=7)
|
||||||
|
par(cex=1.1)
|
||||||
|
plot(gt$value,gt$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20,xlim=c(0,80))
|
||||||
|
abline(v=m,lty=2)
|
||||||
|
abline(v=m75,lty=2)
|
||||||
|
abline(v=m25,lty=2)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
outfile = paste(outputDir, "binnedTiTv_hist.", annotationName, ".pdf", sep="")
|
||||||
|
pdf(outfile, height=7, width=7)
|
||||||
|
par(cex=1.1)
|
||||||
|
plot(gt$value,gt$numVariants,xlab=annotationName,ylab="num Variants in bin",type="h");
|
||||||
|
dev.off()
|
||||||
|
|
@ -4,25 +4,26 @@ args <- commandArgs(TRUE)
|
||||||
verbose = TRUE
|
verbose = TRUE
|
||||||
|
|
||||||
input = args[1]
|
input = args[1]
|
||||||
annotationName = args[2]
|
outputDir = args[2]
|
||||||
|
annotationName = args[3]
|
||||||
|
|
||||||
|
|
||||||
c <- read.table(input, header=T)
|
c <- read.table(input, header=T)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Plot residual error as a function of the covariate
|
# Plot cumulative Ti/Tv ratio as a function of the annotation
|
||||||
#
|
#
|
||||||
|
|
||||||
gt = c[c$GT==1 & c$numVariants>1000,]
|
gt = c[c$GT==1 & c$numVariants>1000,]
|
||||||
lt = c[c$GT==0 & c$numVariants>1000,]
|
lt = c[c$GT==0 & c$numVariants>1000,]
|
||||||
|
|
||||||
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_GTfilter.pdf", sep="")
|
outfile = paste(outputDir, "cumulativeTiTv.", annotationName, ".GTfilter.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
plot(gt$value,gt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"> x",sep=" "),pch=20);
|
plot(gt$value,gt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"> x",sep=" "),pch=20);
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
||||||
outfile = paste(input, ".cumulativeTiTv_", annotationName, "_LTfilter.pdf", sep="")
|
outfile = paste(outputDir, "cumulativeTiTv.", annotationName, ".GTfilter.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
plot(lt$value,lt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"< x",sep=" "),pch=20);
|
plot(lt$value,lt$cumulativeTiTv,xlab=annotationName,ylab="Ti/Tv ratio",main=paste("Filter out SNPs with",annotationName,"< x",sep=" "),pch=20);
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,11 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
||||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||||
private String PATH_TO_RESOURCES = "R/";
|
private String PATH_TO_RESOURCES = "R/";
|
||||||
|
@Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false)
|
||||||
|
private int MIN_VARIANTS_PER_BIN = 1000;
|
||||||
|
@Argument(fullName = "max_variants_per_bin", shortName = "maxBinSize", doc = "The maximum number of variants in a bin.", required = false)
|
||||||
|
private int MAX_VARIANTS_PER_BIN = 20000;
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
|
|
@ -116,6 +121,6 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
||||||
public void onTraversalDone( Integer sum ) {
|
public void onTraversalDone( Integer sum ) {
|
||||||
|
|
||||||
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
|
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
|
||||||
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_DIR);
|
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_DIR, MIN_VARIANTS_PER_BIN, MAX_VARIANTS_PER_BIN);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -52,11 +52,14 @@ public class AnnotationDataManager {
|
||||||
// Loop over each annotation in the vcf record
|
// Loop over each annotation in the vcf record
|
||||||
final Map<String,String> infoField = variant.getInfoValues();
|
final Map<String,String> infoField = variant.getInfoValues();
|
||||||
for( String annotationKey : infoField.keySet() ) {
|
for( String annotationKey : infoField.keySet() ) {
|
||||||
|
if( annotationKey.equalsIgnoreCase("AC") ) {
|
||||||
|
continue; // This annotation isn't a float value
|
||||||
|
}
|
||||||
final float value = Float.parseFloat( infoField.get( annotationKey ) );
|
final float value = Float.parseFloat( infoField.get( annotationKey ) );
|
||||||
|
|
||||||
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
|
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
|
||||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() );
|
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
|
||||||
data.put( annotationKey, treeSet );
|
data.put( annotationKey, treeSet );
|
||||||
}
|
}
|
||||||
AnnotationDatum datum = new AnnotationDatum( value );
|
AnnotationDatum datum = new AnnotationDatum( value );
|
||||||
|
|
@ -66,6 +69,7 @@ public class AnnotationDataManager {
|
||||||
treeSet.add(datum);
|
treeSet.add(datum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decide if it was a Ti or a Tv
|
||||||
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
||||||
datum.incrementTi();
|
datum.incrementTi();
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -74,21 +78,25 @@ public class AnnotationDataManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void plotCumulativeTables(final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR) {
|
public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_DIR,
|
||||||
|
final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
|
||||||
|
|
||||||
for( String annotationKey: data.keySet() ) {
|
for( String annotationKey: data.keySet() ) {
|
||||||
|
|
||||||
|
/*
|
||||||
PrintStream output;
|
PrintStream output;
|
||||||
try {
|
try {
|
||||||
output = new PrintStream(OUTPUT_DIR + annotationKey + ".dat");
|
output = new PrintStream(OUTPUT_DIR + annotationKey + ".cumulative.dat");
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new StingException("Can't create intermediate output annotation data file.");
|
throw new StingException("Can't create intermediate output annotation data file.");
|
||||||
}
|
}
|
||||||
// Output a header line
|
// Output a header line
|
||||||
output.println("value\tcumulativeTiTv\tnumVariants\tGT");
|
output.println("value\tcumulativeTiTv\tnumVariants\tGT");
|
||||||
|
|
||||||
|
// Filter SNPs greater than this annotation value
|
||||||
int numTi = 0;
|
int numTi = 0;
|
||||||
int numTv = 0;
|
int numTv = 0;
|
||||||
for( AnnotationDatum datum : data.get( annotationKey )) {
|
for( AnnotationDatum datum : data.get( annotationKey ) ) {
|
||||||
numTi += datum.numTransitions;
|
numTi += datum.numTransitions;
|
||||||
numTv += datum.numTransversions;
|
numTv += datum.numTransversions;
|
||||||
float titv;
|
float titv;
|
||||||
|
|
@ -98,6 +106,7 @@ public class AnnotationDataManager {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Filter SNPs less than this annotation value
|
||||||
numTi = 0;
|
numTi = 0;
|
||||||
numTv = 0;
|
numTv = 0;
|
||||||
Iterator<AnnotationDatum> iter = data.get( annotationKey ).descendingIterator();
|
Iterator<AnnotationDatum> iter = data.get( annotationKey ).descendingIterator();
|
||||||
|
|
@ -111,15 +120,66 @@ public class AnnotationDataManager {
|
||||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t0");
|
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) +"\t0");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
PrintStream output;
|
||||||
|
try {
|
||||||
|
output = new PrintStream(OUTPUT_DIR + annotationKey + ".binned.dat");
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new StingException("Can't create intermediate output annotation data file.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output a header line
|
||||||
|
output.println("value\ttitv\tnumVariants\t");
|
||||||
|
|
||||||
|
// Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv
|
||||||
|
int numTi = 0;
|
||||||
|
int numTv = 0;
|
||||||
|
AnnotationDatum lastDatum = null;
|
||||||
|
for( AnnotationDatum datum : data.get( annotationKey ) ) {
|
||||||
|
numTi += datum.numTransitions;
|
||||||
|
numTv += datum.numTransversions;
|
||||||
|
lastDatum = datum;
|
||||||
|
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||||
|
float titv;
|
||||||
|
if( numTv == 0) { titv = 0.0f; }
|
||||||
|
else { titv = ((float) numTi) / ((float) numTv); }
|
||||||
|
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv));
|
||||||
|
numTi = 0;
|
||||||
|
numTv = 0;
|
||||||
|
}
|
||||||
|
// else, continue accumulating variants because this bin isn't full yet
|
||||||
|
}
|
||||||
|
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
|
||||||
|
float titv;
|
||||||
|
if( numTv == 0) { titv = 0.0f; }
|
||||||
|
else { titv = ((float) numTi) / ((float) numTv); }
|
||||||
|
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
||||||
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
|
OUTPUT_DIR + annotationKey + ".cumulative.dat" + " " + OUTPUT_DIR + " " + annotationKey);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_CumulativeTiTv.R" + " " +
|
||||||
OUTPUT_DIR + annotationKey + ".dat" + " " + annotationKey);
|
OUTPUT_DIR + annotationKey + ".cumulative.dat"+ " " + OUTPUT_DIR + " " + annotationKey);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new StingException("Unable to run RScript command");
|
throw new StingException("Unable to execute RScript command");
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
//System.out.println(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
|
||||||
|
// OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey +
|
||||||
|
// " " + MIN_VARIANTS_PER_BIN);
|
||||||
|
|
||||||
|
// Execute the RScript command to plot the table of TiTv values
|
||||||
|
try {
|
||||||
|
final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
|
||||||
|
OUTPUT_DIR + annotationKey + ".binned.dat" + " " + OUTPUT_DIR + " " + annotationKey +
|
||||||
|
" " + MIN_VARIANTS_PER_BIN);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new StingException("Unable to execute RScript command");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue