Initial checkin of AnalyzeCovariates.java which replaces analyzeRecalQuals_1KG.py and is updated to use the new Covariates system. It creates similar plots of residual error for each covariate that was used in the calculation. There is also an option to filter out base qualities below a given threshold.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2215 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2838629724
commit
67179e2412
|
|
@ -0,0 +1,55 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
covariateName = args[2]
|
||||
|
||||
#X11(width=7, height=14)
|
||||
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
|
||||
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
c <- c[sort.list(c[,1]),]
|
||||
|
||||
d.good <- c[c$nBases >= 1000,]
|
||||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3))
|
||||
}
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=16, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=16)
|
||||
} else {
|
||||
plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
e = d.good
|
||||
f = d.1000
|
||||
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n")
|
||||
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=4, col="cornflowerblue")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
} else {
|
||||
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
|
||||
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n")
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
}
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
#!/broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
input = args[1]
|
||||
Qcutoff = as.numeric(args[2])
|
||||
|
||||
t=read.table(input, header=T)
|
||||
#t=read.csv(input)
|
||||
#par(mfrow=c(2,1), cex=1.2)
|
||||
|
||||
#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
|
||||
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
|
||||
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
|
||||
f <- t[t$Qreported < Qcutoff,]
|
||||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
|
||||
rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
if (length(t$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3));
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
|
||||
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count",yaxt="n")
|
||||
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
|
||||
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Reported quality score histogram", xlab="Reported quality score", ylab="Count",yaxt="n")
|
||||
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
@ -114,12 +114,13 @@
|
|||
<fileset dir="build" includes="**/alignment/**/*.class" />
|
||||
</jar>
|
||||
|
||||
<jar jarfile="${dist}/FourBaseRecaller.jar" whenmanifestonly="skip">
|
||||
<jar jarfile="${dist}/AnalyzeCovariates.jar" whenmanifestonly="skip">
|
||||
<fileset dir="build">
|
||||
<include name="**/secondarybase/**/*.class" />
|
||||
<include name="**/analyzecovariates/**/*.class" />
|
||||
<include name="**/gatk/walkers/recalibration/*.class" />
|
||||
</fileset>
|
||||
<manifest>
|
||||
<attribute name="Main-Class" value="org.broadinstitute.sting.secondarybase.FourBaseRecaller" />
|
||||
<attribute name="Main-Class" value="org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates" />
|
||||
</manifest>
|
||||
</jar>
|
||||
|
||||
|
|
@ -150,7 +151,7 @@
|
|||
</manifest>
|
||||
</jar>
|
||||
|
||||
<jar jarfile="${dist}/FourBaseRecaller.jar" update="true" whenmanifestonly="skip">
|
||||
<jar jarfile="${dist}/AnalyzeCovariates.jar" update="true" whenmanifestonly="skip">
|
||||
<manifest>
|
||||
<attribute name="Class-Path" value="${jar.classpath}" />
|
||||
</manifest>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,83 @@
|
|||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.NHashMap;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 1, 2009
|
||||
*/
|
||||
|
||||
public class AnalysisDataManager {
|
||||
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed
|
||||
|
||||
AnalysisDataManager() {
|
||||
}
|
||||
|
||||
AnalysisDataManager( final int numCovariates ) {
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate
|
||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
*/
|
||||
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) {
|
||||
|
||||
int qscore = Integer.parseInt( key.get(1).toString() );
|
||||
ArrayList<Comparable> newKey;
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) {
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle etc.
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(iii + 1) ); // and the given covariate
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,307 @@
|
|||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.*;
|
||||
import org.broadinstitute.sting.utils.PackageUtils;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 1, 2009
|
||||
*/
|
||||
|
||||
public class AnalyzeCovariates {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
private static String RECAL_FILE = "output.recal_data.csv";
|
||||
private static String OUTPUT_DIR = "analyzeCovariates/";
|
||||
private static String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
private static String PATH_TO_RESOURCES = "R/";
|
||||
private static int IGNORE_QSCORES_LESS_THAN = 5;
|
||||
private static int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private static AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private static ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
// parse command line arguments
|
||||
parseArguments( args );
|
||||
// create the output directory where all the data tables and plots will go
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR);
|
||||
}
|
||||
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
// initialize all the data from the csv file and allocate the list of covariates
|
||||
System.out.println("Reading in input csv file...");
|
||||
initializeData();
|
||||
System.out.println("...Done!");
|
||||
|
||||
// output data tables for Rscript to read in
|
||||
System.out.println("Writing out intermediate tables for R...");
|
||||
writeDataTables();
|
||||
System.out.println("...Done!");
|
||||
|
||||
// perform the analysis using Rscript and output the plots
|
||||
System.out.println("Calling analysis R scripts and writing out figures...");
|
||||
callRScripts();
|
||||
System.out.println("...Done!");
|
||||
|
||||
}
|
||||
|
||||
private static void parseArguments( String[] args ) {
|
||||
int iii = 0;
|
||||
String arg;
|
||||
|
||||
try {
|
||||
while( iii < args.length && args[iii].startsWith("-") ) {
|
||||
arg = args[iii++];
|
||||
|
||||
if( arg.equals( "-recalFile" ) ) {
|
||||
RECAL_FILE = args[iii++];
|
||||
} else if( arg.equals( "-Rscript" ) ) {
|
||||
PATH_TO_RSCRIPT = args[iii++];
|
||||
} else if( arg.equals( "-resources" ) ) {
|
||||
PATH_TO_RESOURCES = args[iii++];
|
||||
} else if( arg.equals( "-ignoreQ" ) ) {
|
||||
IGNORE_QSCORES_LESS_THAN = Integer.parseInt( args[iii++] );
|
||||
} else if (arg.equals( "-numRG" ) ) {
|
||||
NUM_READ_GROUPS_TO_PROCESS = Integer.parseInt( args[iii++] );
|
||||
} else if( arg.equals( "-outputDir" ) ) {
|
||||
OUTPUT_DIR = args[iii++];
|
||||
} else {
|
||||
iii = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( iii != args.length ) {
|
||||
throw new RuntimeException( "Exception" );
|
||||
}
|
||||
} catch(Exception e) {
|
||||
System.out.println( "Usage: [-option param] \n" );
|
||||
System.out.println(" Available options:");
|
||||
System.out.println("\t-recalFile <path>\tPath to input recal csv file. Default value: output.recal_data.csv");
|
||||
System.out.println("\t-Rscript <path>\t\tPath to your implementation of Rscript. Default value: /broad/tools/apps/R-2.6.0/bin/Rscript");
|
||||
System.out.println("\t-resources <path>\tPath to resources folder holding the Sting R scripts. Default value: R/");
|
||||
System.out.println("\t-outputDir <path>\tWhere to put the output plots. Default value: analyzeCovariates/");
|
||||
System.out.println("\t-ignoreQ <int>\t\tIgnore bases with reported quality less than this number. Default value: 5");
|
||||
System.out.println("\t-numRG <int>\t\tOnly process N read groups. Default value: -1 (process all read groups)");
|
||||
System.exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
private static void initializeData() {
|
||||
|
||||
// Get a list of all available covariates
|
||||
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
}
|
||||
|
||||
// Don't want to crash with out of heap space exception
|
||||
if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
|
||||
estimatedCapacity = 300 * 40 * 200;
|
||||
}
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( new RecalibrationArgumentCollection() );
|
||||
}
|
||||
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new AnalysisDataManager( requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Can not find input file: " + RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
}
|
||||
|
||||
private static void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
|
||||
}
|
||||
|
||||
private static void writeDataTables() {
|
||||
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
NHashMap<RecalDatum> readGroupTable = dataManager.getCollapsedTable(0);
|
||||
for( List<? extends Comparable> readGroupKey : readGroupTable.keySet() ) {
|
||||
|
||||
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
|
||||
String readGroup = readGroupKey.get(0).toString();
|
||||
RecalDatum readGroupDatum = readGroupTable.get(readGroupKey);
|
||||
System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" );
|
||||
System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0) - readGroupDatum.getEstimatedQReported()));
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
|
||||
// Create a PrintStream
|
||||
PrintStream output = null;
|
||||
try {
|
||||
output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"));
|
||||
|
||||
} catch (FileNotFoundException e) {
|
||||
System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
// Output the header
|
||||
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
|
||||
|
||||
// Loop through the covariate table looking for keys with matching read groups
|
||||
// BUGBUG: hopefully rewrite this to be more efficient
|
||||
for( List<? extends Comparable> covariateKey : dataManager.getCollapsedTable(iii).keySet() ) {
|
||||
if( covariateKey.get(0).toString().equals(readGroup) ) {
|
||||
output.print( covariateKey.get(1).toString() + "\t" ); // Covariate
|
||||
RecalDatum thisDatum = dataManager.getCollapsedTable(iii).get(covariateKey);
|
||||
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
|
||||
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0)) + "\t" ); // Qempirical
|
||||
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
|
||||
output.println( thisDatum.getNumObservations() ); // nBases
|
||||
}
|
||||
}
|
||||
|
||||
// Close the PrintStream
|
||||
output.close();
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static void callRScripts() {
|
||||
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
for( List<? extends Comparable> readGroupList : dataManager.getCollapsedTable(0).keySet() ) {
|
||||
|
||||
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
|
||||
|
||||
String readGroup = readGroupList.get(0).toString();
|
||||
System.out.println("Analyzing read group: " + readGroup);
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
try {
|
||||
if( iii == 1 ) {
|
||||
// Analyze reported quality
|
||||
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
|
||||
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
|
||||
IGNORE_QSCORES_LESS_THAN); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
|
||||
} else { // Analyze all other covariates
|
||||
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
|
||||
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
|
||||
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -44,6 +44,7 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
|
||||
public NHashMap<RecalDatum> data; // The full dataset
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
|
|
|
|||
|
|
@ -170,7 +170,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + vals[iii] + ") isn't a valid covariate option." );
|
||||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Can not find input file: " + RAC.RECAL_FILE);
|
||||
throw new StingException("Can not find input file: " + RAC.RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
|
|
@ -277,7 +277,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
}
|
||||
// Create a new datum using the number of observations and number of mismatches
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum );
|
||||
|
|
@ -368,7 +368,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalDeltaQEmpirical != null ) {
|
||||
Double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
|
||||
double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue