Implementation of the Gatherer class for CountCovariates, which makes CountCovariates scatter/gatherable. Kudos to the @Gather annotation Khalid just introduced!

QuickCCTest is my test script for the gatherer.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5547 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-31 21:15:21 +00:00
parent 20344a27b4
commit 0a772688fe
3 changed files with 23 additions and 21 deletions

View File

@ -30,27 +30,24 @@ public class CountCovariatesGatherer extends Gatherer {
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String EOF_MARKER = "EOF"; private static final String EOF_MARKER = "EOF";
private HashMap<String, int[]> dataMap; private HashMap<String, RecalDatumOptimized> dataMap;
private void addCSVData (String line) { private void addCSVData (String line) {
String[] covariates = line.split(","); String[] covariates = line.split(",");
String key = ""; String key = "";
int [] values = new int[3]; RecalDatumOptimized values;
for (int i = 0; i < covariates.length-3; i++) { for (int i = 0; i < covariates.length-3; i++) {
key += covariates[i] + ","; key += covariates[i] + ",";
} }
for (int i = covariates.length-3; i < covariates.length; i++) { values = new RecalDatumOptimized(Integer.parseInt(covariates[covariates.length-3]),
values[i] = Integer.parseInt(covariates[i].trim()); Integer.parseInt(covariates[covariates.length-2]));
}
if (dataMap.get(key) != null) { if (dataMap.get(key) != null) {
int [] currentValues = dataMap.get(key); RecalDatumOptimized currentValues = dataMap.get(key);
for (int i = 0; i < 2; i++) { values.increment(currentValues);
values[i] += currentValues[i];// todo -- update the third value using the CountCovariatesWalker function
}
} }
dataMap.put(key, values); dataMap.put(key, values);
@ -58,7 +55,7 @@ public class CountCovariatesGatherer extends Gatherer {
@Override @Override
public void gather(List<File> inputs, File output) { public void gather(List<File> inputs, File output) {
dataMap = new HashMap<String, int[]>(); dataMap = new HashMap<String, RecalDatumOptimized>();
PrintStream o; PrintStream o;
try { try {
o = new PrintStream(output); o = new PrintStream(output);
@ -67,7 +64,7 @@ public class CountCovariatesGatherer extends Gatherer {
} }
boolean sawEOF = false; boolean sawEOF = false;
boolean headerPrinted = false; boolean printedHeader = false;
// Read input files // Read input files
for ( File RECAL_FILE : inputs) { for ( File RECAL_FILE : inputs) {
@ -76,11 +73,12 @@ public class CountCovariatesGatherer extends Gatherer {
if ( EOF_MARKER.equals(line) ) { if ( EOF_MARKER.equals(line) ) {
sawEOF = true; // sanity check sawEOF = true; // sanity check
} }
else if( COMMENT_PATTERN.matcher(line).matches() || COVARIATE_PATTERN.matcher(line).matches() ) { else if(COMMENT_PATTERN.matcher(line).matches()) {
if (!headerPrinted) { ; // It doesn't make any sense to print intermediate comments, unless we merge them somehow (would require strict definition for the header)
headerPrinted = true; }
else if (COVARIATE_PATTERN.matcher(line).matches()) {
if (!printedHeader)
o.println(line); o.println(line);
} // Skip over the header (could check if headers are the same, but probably not necessary)
} }
else { // Found a line of data else { // Found a line of data
addCSVData(line); // Parse the line and add the data to the HashMap addCSVData(line); // Parse the line and add the data to the HashMap
@ -95,14 +93,15 @@ public class CountCovariatesGatherer extends Gatherer {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!"; final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
throw new UserException.MalformedFile(RECAL_FILE, errorMessage); throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
} }
printedHeader = true;
} }
// Write output file from dataMap // Write output file from dataMap
for(String key : dataMap.keySet()) { for(String key : dataMap.keySet()) {
int [] values = dataMap.get(key); RecalDatumOptimized values = dataMap.get(key);
String v = "," + values[0] + "," + values[1] + "," + values[2]; String v = values.getNumObservations() + "," + values.getNumMismatches() + "," + values.empiricalQualByte();
o.println(key + v); o.println(key + v);
} }
o.println("EOF");
} }
} }

View File

@ -92,13 +92,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@Output @Output
@Gather(CountCovariatesGatherer.class)
PrintStream out; PrintStream out;
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
///////////////////////////// /////////////////////////////
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file") @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE; public PrintStream RECAL_FILE;
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)

View File

@ -33,6 +33,8 @@ class QuickCCTest extends QScript {
def script = { def script = {
val recal = new File("recal.csv")
val cc = new CountCovariates() val cc = new CountCovariates()
cc.reference_sequence = reference cc.reference_sequence = reference
cc.input_file :+= input cc.input_file :+= input
@ -40,7 +42,8 @@ class QuickCCTest extends QScript {
cc.intervalsString = intervals cc.intervalsString = intervals
cc.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") cc.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
cc.scatterCount = 4 cc.scatterCount = 4
cc.recal_file = new File("recal.csv") cc.recal_file = recal
cc.memoryLimit = 4
add(cc); add(cc);
} }