Initial checkin of AnalyzeCovariates.java which replaces analyzeRecalQuals_1KG.py and is updated to use the new Covariates system. It creates similar plots of residual error for each covariate that was used in the calculation. There is also an option to filter out base qualities below a given threshold.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2215 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-12-02 16:47:35 +00:00
parent 2838629724
commit 67179e2412
12 changed files with 510 additions and 8 deletions

View File

@ -0,0 +1,55 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
#X11(width=7, height=14)
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
c <- c[sort.list(c[,1]),]
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
rmseAll = sqrt(sum((c$Qempirical-c$Qreported)^2 * c$nBases) / sum(c$nBases) )
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE = ", round(rmseAll,digits=3))
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qempirical-d.good$Qreported, type="p", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=16, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="p", col="cornflowerblue", pch=16)
} else {
plot(c$Covariate, c$Qempirical-c$Qreported, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$Qempirical-d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
if( is.numeric(c$Covariate) ) {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n")
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=4, col="cornflowerblue")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else {
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=4, main=paste(covariateName,"histogram"), xlab=covariateName, ylab="Count",yaxt="n",xaxt="n")
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -0,0 +1,55 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])
t=read.table(input, header=T)
#t=read.csv(input)
#par(mfrow=c(2,1), cex=1.2)
#outfile = paste(input, ".quality_emp_v_stated.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
rmseBlue = sqrt(sum((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases) / sum(d.good$nBases) )
rmseAll = sqrt(sum((e$Qempirical-e$Qreported)^2 * e$nBases) / sum(e$nBases) )
theTitle = paste("RMSE_good = ", round(rmseBlue,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
if (length(t$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE = ", round(rmseAll,digits=3));
}
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2)
dev.off()
#outfile = paste(input, ".quality_emp_hist.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), main="Reported quality score histogram", xlab="Reported quality score", ylab="Count",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -114,12 +114,13 @@
<fileset dir="build" includes="**/alignment/**/*.class" />
</jar>
<jar jarfile="${dist}/FourBaseRecaller.jar" whenmanifestonly="skip">
<jar jarfile="${dist}/AnalyzeCovariates.jar" whenmanifestonly="skip">
<fileset dir="build">
<include name="**/secondarybase/**/*.class" />
<include name="**/analyzecovariates/**/*.class" />
<include name="**/gatk/walkers/recalibration/*.class" />
</fileset>
<manifest>
<attribute name="Main-Class" value="org.broadinstitute.sting.secondarybase.FourBaseRecaller" />
<attribute name="Main-Class" value="org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates" />
</manifest>
</jar>
@ -150,7 +151,7 @@
</manifest>
</jar>
<jar jarfile="${dist}/FourBaseRecaller.jar" update="true" whenmanifestonly="skip">
<jar jarfile="${dist}/AnalyzeCovariates.jar" update="true" whenmanifestonly="skip">
<manifest>
<attribute name="Class-Path" value="${jar.classpath}" />
</manifest>

View File

@ -0,0 +1,83 @@
package org.broadinstitute.sting.analyzecovariates;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.NHashMap;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Dec 1, 2009
*/
public class AnalysisDataManager {
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed
AnalysisDataManager() {
}
AnalysisDataManager( final int numCovariates ) {
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
}
/**
* Add the given mapping to all of the collapsed hash tables
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
*/
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) {
int qscore = Integer.parseInt( key.get(1).toString() );
ArrayList<Comparable> newKey;
RecalDatum collapsedDatum;
if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) {
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // Make a new key with just the read group
collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
}
}
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle etc.
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // Make a new key with the read group ...
newKey.add( key.get(iii + 1) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.combine( fullDatum );
}
}
}
}
/**
* Get the appropriate collapsed table out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
} else {
return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed
}
}
}

View File

@ -0,0 +1,307 @@
package org.broadinstitute.sting.analyzecovariates;
import org.broadinstitute.sting.gatk.walkers.recalibration.*;
import org.broadinstitute.sting.utils.PackageUtils;
import org.broadinstitute.sting.utils.xReadLines;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.io.*;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Dec 1, 2009
*/
public class AnalyzeCovariates {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
private static String RECAL_FILE = "output.recal_data.csv";
private static String OUTPUT_DIR = "analyzeCovariates/";
private static String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
private static String PATH_TO_RESOURCES = "R/";
private static int IGNORE_QSCORES_LESS_THAN = 5;
private static int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
/////////////////////////////
// Private Member Variables
/////////////////////////////
private static AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private static ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
public static void main(String[] args) {
// parse command line arguments
parseArguments( args );
// create the output directory where all the data tables and plots will go
try {
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
} catch (IOException e) {
throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR);
}
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
// initialize all the data from the csv file and allocate the list of covariates
System.out.println("Reading in input csv file...");
initializeData();
System.out.println("...Done!");
// output data tables for Rscript to read in
System.out.println("Writing out intermediate tables for R...");
writeDataTables();
System.out.println("...Done!");
// perform the analysis using Rscript and output the plots
System.out.println("Calling analysis R scripts and writing out figures...");
callRScripts();
System.out.println("...Done!");
}
private static void parseArguments( String[] args ) {
int iii = 0;
String arg;
try {
while( iii < args.length && args[iii].startsWith("-") ) {
arg = args[iii++];
if( arg.equals( "-recalFile" ) ) {
RECAL_FILE = args[iii++];
} else if( arg.equals( "-Rscript" ) ) {
PATH_TO_RSCRIPT = args[iii++];
} else if( arg.equals( "-resources" ) ) {
PATH_TO_RESOURCES = args[iii++];
} else if( arg.equals( "-ignoreQ" ) ) {
IGNORE_QSCORES_LESS_THAN = Integer.parseInt( args[iii++] );
} else if (arg.equals( "-numRG" ) ) {
NUM_READ_GROUPS_TO_PROCESS = Integer.parseInt( args[iii++] );
} else if( arg.equals( "-outputDir" ) ) {
OUTPUT_DIR = args[iii++];
} else {
iii = -1;
break;
}
}
if( iii != args.length ) {
throw new RuntimeException( "Exception" );
}
} catch(Exception e) {
System.out.println( "Usage: [-option param] \n" );
System.out.println(" Available options:");
System.out.println("\t-recalFile <path>\tPath to input recal csv file. Default value: output.recal_data.csv");
System.out.println("\t-Rscript <path>\t\tPath to your implementation of Rscript. Default value: /broad/tools/apps/R-2.6.0/bin/Rscript");
System.out.println("\t-resources <path>\tPath to resources folder holding the Sting R scripts. Default value: R/");
System.out.println("\t-outputDir <path>\tWhere to put the output plots. Default value: analyzeCovariates/");
System.out.println("\t-ignoreQ <int>\t\tIgnore bases with reported quality less than this number. Default value: 5");
System.out.println("\t-numRG <int>\t\tOnly process N read groups. Default value: -1 (process all read groups)");
System.exit(-1);
}
}
private static void initializeData() {
// Get a list of all available covariates
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
int lineNumber = 0;
boolean foundAllCovariates = false;
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
try {
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
lineNumber++;
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#')
}
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
} catch ( InstantiationException e ) {
throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new RuntimeException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
}
if( !foundClass ) {
throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
}
}
}
} else { // Found a line of data
if( !foundAllCovariates ) {
foundAllCovariates = true;
// At this point all the covariates should have been found and initialized
if( requestedCovariates.size() < 2 ) {
throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE );
}
// Don't want to crash with out of heap space exception
if( estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0 ) { // Could be negative if overflowed
estimatedCapacity = 300 * 40 * 200;
}
// Initialize any covariate member variables using the shared argument collection
for( Covariate cov : requestedCovariates ) {
cov.initialize( new RecalibrationArgumentCollection() );
}
// Initialize the data hashMaps
dataManager = new AnalysisDataManager( requestedCovariates.size() );
}
addCSVData(line); // Parse the line and add the data to the HashMap
}
}
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Can not find input file: " + RECAL_FILE);
} catch ( NumberFormatException e ) {
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
}
}
private static void addCSVData(String line) {
String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
ArrayList<Comparable> key = new ArrayList<Comparable>();
Covariate cov;
int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
cov = requestedCovariates.get( iii );
key.add( cov.getValue( vals[iii] ) );
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
}
private static void writeDataTables() {
int numReadGroups = 0;
// for each read group
NHashMap<RecalDatum> readGroupTable = dataManager.getCollapsedTable(0);
for( List<? extends Comparable> readGroupKey : readGroupTable.keySet() ) {
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
String readGroup = readGroupKey.get(0).toString();
RecalDatum readGroupDatum = readGroupTable.get(readGroupKey);
System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" );
System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0) - readGroupDatum.getEstimatedQReported()));
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
// Create a PrintStream
PrintStream output = null;
try {
output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"));
} catch (FileNotFoundException e) {
System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
System.exit(-1);
}
// Output the header
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
// Loop through the covariate table looking for keys with matching read groups
// BUGBUG: hopefully rewrite this to be more efficient
for( List<? extends Comparable> covariateKey : dataManager.getCollapsedTable(iii).keySet() ) {
if( covariateKey.get(0).toString().equals(readGroup) ) {
output.print( covariateKey.get(1).toString() + "\t" ); // Covariate
RecalDatum thisDatum = dataManager.getCollapsedTable(iii).get(covariateKey);
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0)) + "\t" ); // Qempirical
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
output.println( thisDatum.getNumObservations() ); // nBases
}
}
// Close the PrintStream
output.close();
}
} else {
break;
}
}
}
private static void callRScripts() {
int numReadGroups = 0;
// for each read group
for( List<? extends Comparable> readGroupList : dataManager.getCollapsedTable(0).keySet() ) {
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
String readGroup = readGroupList.get(0).toString();
System.out.println("Analyzing read group: " + readGroup);
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
try {
if( iii == 1 ) {
// Analyze reported quality
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
} else { // Analyze all other covariates
Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument which is the name of the covariate to make the plots look nice
}
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}
} else {
break;
}
}
}
}

View File

@ -44,6 +44,7 @@ import net.sf.samtools.SAMReadGroupRecord;
*/
public class RecalDataManager {
public NHashMap<RecalDatum> data; // The full dataset
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed

View File

@ -170,7 +170,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
if( !foundClass ) {
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + vals[iii] + ") isn't a valid covariate option." );
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
}
}
@ -210,7 +210,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
} catch ( FileNotFoundException e ) {
Utils.scareUser("Can not find input file: " + RAC.RECAL_FILE);
throw new StingException("Can not find input file: " + RAC.RECAL_FILE);
} catch ( NumberFormatException e ) {
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
}
@ -277,7 +277,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( cov.getValue( vals[iii] ) );
}
}
// Create a new datum using the number of observations and number of mismatches
// Create a new datum using the number of observations, number of mismatches, and reported quality score
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum );
@ -368,7 +368,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
double globalDeltaQ = 0.0;
if( globalDeltaQEmpirical != null ) {
Double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
}