Added comments to AnalyzeCovariates and R scripts. R script prevents residuals from going off the edge of the plot. Added skeleton code to the recalibration walkers showing how we plan to handle SOLID reference inserting behavior.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2233 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
451a20ed55
commit
46f3d3e39b
|
|
@ -6,36 +6,47 @@ verbose = TRUE
|
|||
input = args[1]
|
||||
covariateName = args[2]
|
||||
|
||||
#X11(width=7, height=14)
|
||||
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
|
||||
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
c <- c[sort.list(c[,1]),]
|
||||
|
||||
#
|
||||
# Plot residual error as a function of the covariate
|
||||
#
|
||||
|
||||
d.good <- c[c$nBases >= 1000,]
|
||||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3))
|
||||
}
|
||||
}
|
||||
# Don't let residual error go off the edge of the plot
|
||||
d.good$residualError = d.good$Qempirical-d.good$Qreported
|
||||
d.good$residualError[which(d.good$residualError > 10)] = 10
|
||||
d.good$residualError[which(d.good$residualError < -10)] = -10
|
||||
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
|
||||
d.1000$residualError[which(d.1000$residualError > 10)] = 10
|
||||
d.1000$residualError[which(d.1000$residualError < -10)] = -10
|
||||
c$residualError = c$Qempirical-c$Qreported
|
||||
c$residualError[which(c$residualError > 10)] = 10
|
||||
c$residualError[which(c$residualError < -10)] = -10
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=20)
|
||||
} else {
|
||||
plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
plot(d.good$Covariate, d.good$residualError, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$residualError, type="p", col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
|
||||
e = d.good
|
||||
f = d.1000
|
||||
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
|
||||
|
|
@ -46,7 +57,7 @@ if( is.numeric(c$Covariate) ) {
|
|||
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=2, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=2, col="cornflowerblue")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
} else {
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
|
||||
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=7, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n")
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ input = args[1]
|
|||
Qcutoff = as.numeric(args[2])
|
||||
|
||||
t=read.table(input, header=T)
|
||||
#t=read.csv(input)
|
||||
#par(mfrow=c(2,1), cex=1.2)
|
||||
|
||||
#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
#
|
||||
# Plot of reported quality versus empirical quality
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
|
||||
|
|
@ -18,12 +18,12 @@ d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
|
|||
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
|
||||
f <- t[t$Qreported < Qcutoff,]
|
||||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseGood = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
if (length(t$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3));
|
||||
}
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
|
|
@ -31,8 +31,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
|
|||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
#
|
||||
# Plot Q empirical histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
|
||||
|
|
@ -45,6 +47,7 @@ dev.off()
|
|||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ import java.util.List;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 1, 2009
|
||||
*
|
||||
* The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed
|
||||
* by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate.
|
||||
*/
|
||||
|
||||
public class AnalysisDataManager {
|
||||
|
|
|
|||
|
|
@ -292,7 +292,7 @@ public class AnalyzeCovariates {
|
|||
} else { // Analyze all other covariates
|
||||
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
|
||||
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
|
||||
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice
|
||||
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.0.11"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.0.12"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -268,7 +268,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
int offset;
|
||||
byte refBase;
|
||||
byte prevBase;
|
||||
byte[] colorSpaceQuals;
|
||||
byte[] bases;
|
||||
|
||||
// For each read at this locus
|
||||
|
|
@ -298,16 +297,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
|
||||
colorSpaceQuals = null;
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base
|
||||
// so decrease the quality of this base by forcing it to be a mismatch
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
|
||||
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
}
|
||||
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
|
||||
{
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) {
|
||||
refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch
|
||||
}
|
||||
}
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
} else {
|
||||
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
|
||||
countedBases++; // Replicating a small bug in the old recalibrator
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ public class RecalDataManager {
|
|||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
private static boolean warnUserNullReadGroup = false;
|
||||
|
||||
RecalDataManager() {
|
||||
|
|
@ -243,4 +244,14 @@ public class RecalDataManager {
|
|||
readGroup.setPlatform( RAC.FORCE_PLATFORM );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
|
||||
* @param base The nucleotide we are checking
|
||||
* @param read The read which contains the color space to check against
|
||||
* @return Returns true if the base is inconsistent with the color space
|
||||
*/
|
||||
public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.0.10"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -328,12 +328,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
|
||||
|
|
@ -430,20 +424,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param colorSpaceQuals The list of color space quality scores for this read
|
||||
*/
|
||||
private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
|
|||
Loading…
Reference in New Issue