AnalyzeAnnotations creates a plot of dbsnp rate as a function of the annotations.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2711 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
668defc841
commit
b8ae083d1b
|
|
@ -7,13 +7,8 @@ input = args[1]
|
||||||
annotationName = args[2]
|
annotationName = args[2]
|
||||||
minBinCutoff = as.numeric(args[3])
|
minBinCutoff = as.numeric(args[3])
|
||||||
|
|
||||||
|
|
||||||
c <- read.table(input, header=T)
|
c <- read.table(input, header=T)
|
||||||
|
|
||||||
#
|
|
||||||
# Plot TiTv ratio as a function of the annotation
|
|
||||||
#
|
|
||||||
|
|
||||||
all = c[c$numVariants>minBinCutoff & c$category=="all",]
|
all = c[c$numVariants>minBinCutoff & c$category=="all",]
|
||||||
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
|
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
|
||||||
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
|
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
|
||||||
|
|
@ -24,10 +19,14 @@ ymax = max(d$titv)
|
||||||
xmin = min(d$value)
|
xmin = min(d$value)
|
||||||
xmax = max(d$value)
|
xmax = max(d$value)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot TiTv ratio as a function of the annotation
|
||||||
|
#
|
||||||
|
|
||||||
outfile = paste(input, ".TiTv.", annotationName, ".pdf", sep="")
|
outfile = paste(input, ".TiTv.", annotationName, ".pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||||
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
|
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
|
||||||
ma = all[all$value > m,]
|
ma = all[all$value > m,]
|
||||||
|
|
@ -42,10 +41,14 @@ points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
|
||||||
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
|
||||||
|
#
|
||||||
|
|
||||||
outfile = paste(input, ".TiTv_log.", annotationName, ".pdf", sep="")
|
outfile = paste(input, ".TiTv_log.", annotationName, ".pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||||
abline(v=m,lty=2)
|
abline(v=m,lty=2)
|
||||||
abline(v=m75,lty=2)
|
abline(v=m75,lty=2)
|
||||||
|
|
@ -55,10 +58,46 @@ points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
|
||||||
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot dbsnp rate as a function of the annotation
|
||||||
|
#
|
||||||
|
|
||||||
|
outfile = paste(input, ".dbsnpRate.", annotationName, ".pdf", sep="")
|
||||||
|
pdf(outfile, height=7, width=7)
|
||||||
|
par(cex=1.1)
|
||||||
|
plot(all$value,all$dbsnp,xlab=annotationName,ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
|
||||||
|
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||||
|
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
|
||||||
|
ma = all[all$value > m,]
|
||||||
|
mb = all[all$value < m,]
|
||||||
|
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
|
||||||
|
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
|
||||||
|
abline(v=m,lty=2)
|
||||||
|
abline(v=m75,lty=2)
|
||||||
|
abline(v=m25,lty=2)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot dbsnp rate as a function of the annotation, log scale on the x-axis
|
||||||
|
#
|
||||||
|
|
||||||
|
outfile = paste(input, ".dbsnpRate_log.", annotationName, ".pdf", sep="")
|
||||||
|
pdf(outfile, height=7, width=7)
|
||||||
|
par(cex=1.1)
|
||||||
|
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab="DBsnp Rate",pch=20,xaxt="n",ps=14);
|
||||||
|
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||||
|
abline(v=m,lty=2)
|
||||||
|
abline(v=m75,lty=2)
|
||||||
|
abline(v=m25,lty=2)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Plot histogram of the annotation's value
|
||||||
|
#
|
||||||
|
|
||||||
outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="")
|
outfile = paste(input, "TiTv_hist.", annotationName, ".pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
plot(all$value,all$numVariants,xlab=annotationName,ylab="num variants in bin",type="h",xaxt="n",ps=14);
|
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14);
|
||||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
@ -73,7 +73,7 @@ public class AnnotationDataManager {
|
||||||
data.put( annotationKey, treeSet );
|
data.put( annotationKey, treeSet );
|
||||||
}
|
}
|
||||||
AnnotationDatum datum = new AnnotationDatum( value );
|
AnnotationDatum datum = new AnnotationDatum( value );
|
||||||
if( treeSet.contains(datum) ) { // contains uses AnnotationDatum's equals function, so it only checks if the value field is already present
|
if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present
|
||||||
datum = treeSet.tailSet(datum).first();
|
datum = treeSet.tailSet(datum).first();
|
||||||
} else {
|
} else {
|
||||||
treeSet.add(datum);
|
treeSet.add(datum);
|
||||||
|
|
@ -95,39 +95,42 @@ public class AnnotationDataManager {
|
||||||
final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
|
final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
|
||||||
|
|
||||||
final AnnotationDatum thisAnnotationBin = new AnnotationDatum();
|
final AnnotationDatum thisAnnotationBin = new AnnotationDatum();
|
||||||
System.out.println( "\nExecuting RScript commands:" );
|
System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" );
|
||||||
|
|
||||||
// For each annotation we've seen
|
// For each annotation we've seen
|
||||||
for( String annotationKey : data.keySet() ) {
|
for( String annotationKey : data.keySet() ) {
|
||||||
|
|
||||||
PrintStream output;
|
PrintStream output;
|
||||||
try {
|
try {
|
||||||
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation
|
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the intermediate data file for this annotation
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch ( FileNotFoundException e ) {
|
||||||
throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " +
|
throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " +
|
||||||
OUTPUT_PREFIX + annotationKey + ".dat");
|
OUTPUT_PREFIX + annotationKey + ".dat");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Output a header line
|
// Output a header line
|
||||||
output.println("value\ttitv\tnumVariants\tcategory");
|
output.println("value\ttitv\tdbsnp\tnumVariants\tcategory");
|
||||||
|
|
||||||
// Bin SNPs and calculate truth metrics for each bin
|
// Bin SNPs and calculate truth metrics for each bin
|
||||||
thisAnnotationBin.clearBin();
|
thisAnnotationBin.clearBin();
|
||||||
for( AnnotationDatum datum : data.get( annotationKey ) ) {
|
for( AnnotationDatum datum : data.get( annotationKey ) ) {
|
||||||
thisAnnotationBin.combine( datum );
|
thisAnnotationBin.combine( datum );
|
||||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() +
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||||
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||||
thisAnnotationBin.clearBin();
|
thisAnnotationBin.clearBin();
|
||||||
}
|
}
|
||||||
// else, continue accumulating variants because this bin isn't full yet
|
// else, continue accumulating variants because this bin isn't full yet
|
||||||
}
|
}
|
||||||
|
|
||||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out
|
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // One final bin that may not have been dumped out
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() +
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||||
|
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t0.0\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||||
|
thisAnnotationBin.clearBin();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -127,13 +127,23 @@ public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final public float calcDBsnpRate() {
|
||||||
|
|
||||||
|
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
|
||||||
|
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
|
||||||
|
}
|
||||||
|
|
||||||
final public float calcTPrate() {
|
final public float calcTPrate() {
|
||||||
|
|
||||||
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
|
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
|
||||||
return 0.0f;
|
return 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
|
return 100.0f * ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
|
||||||
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
|
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue