Count Covariates is almost scatter-gatherable now!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5537 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5ddc0e464a
commit
5d26c66769
|
|
@ -0,0 +1,108 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Gatherer;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: carneiro
|
||||||
|
* Date: 3/29/11
|
||||||
|
* Time: 3:54 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
public class CountCovariatesGatherer extends Gatherer {
|
||||||
|
|
||||||
|
/////////////////////////////
|
||||||
|
// Private Member Variables
|
||||||
|
/////////////////////////////
|
||||||
|
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||||
|
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||||
|
private static final String EOF_MARKER = "EOF";
|
||||||
|
|
||||||
|
private HashMap<String, int[]> dataMap;
|
||||||
|
|
||||||
|
|
||||||
|
private void addCSVData (String line) {
|
||||||
|
String[] covariates = line.split(",");
|
||||||
|
String key = "";
|
||||||
|
int [] values = new int[3];
|
||||||
|
|
||||||
|
for (int i = 0; i < covariates.length-3; i++) {
|
||||||
|
key += covariates[i] + ",";
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = covariates.length-3; i < covariates.length; i++) {
|
||||||
|
values[i] = Integer.parseInt(covariates[i].trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dataMap.get(key) != null) {
|
||||||
|
int [] currentValues = dataMap.get(key);
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
values[i] += currentValues[i];// todo -- update the third value using the CountCovariatesWalker function
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dataMap.put(key, values);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void gather(List<File> inputs, File output) {
|
||||||
|
dataMap = new HashMap<String, int[]>();
|
||||||
|
PrintStream o;
|
||||||
|
try {
|
||||||
|
o = new PrintStream(output);
|
||||||
|
} catch ( FileNotFoundException e) {
|
||||||
|
throw new UserException("File to be output by CountCovariates Gather function was not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean sawEOF = false;
|
||||||
|
boolean headerPrinted = false;
|
||||||
|
|
||||||
|
// Read input files
|
||||||
|
for ( File RECAL_FILE : inputs) {
|
||||||
|
try {
|
||||||
|
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||||
|
if ( EOF_MARKER.equals(line) ) {
|
||||||
|
sawEOF = true; // sanity check
|
||||||
|
}
|
||||||
|
else if( COMMENT_PATTERN.matcher(line).matches() || COVARIATE_PATTERN.matcher(line).matches() ) {
|
||||||
|
if (!headerPrinted) {
|
||||||
|
headerPrinted = true;
|
||||||
|
o.println(line);
|
||||||
|
} // Skip over the header (could check if headers are the same, but probably not necessary)
|
||||||
|
}
|
||||||
|
else { // Found a line of data
|
||||||
|
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !sawEOF ) {
|
||||||
|
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write output file from dataMap
|
||||||
|
for(String key : dataMap.keySet()) {
|
||||||
|
int [] values = dataMap.get(key);
|
||||||
|
String v = "," + values[0] + "," + values[1] + "," + values[2];
|
||||||
|
o.println(key + v);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -29,6 +29,7 @@ import org.broad.tribble.bed.BEDCodec;
|
||||||
import org.broad.tribble.dbsnp.DbSNPCodec;
|
import org.broad.tribble.dbsnp.DbSNPCodec;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broad.tribble.vcf.VCFCodec;
|
import org.broad.tribble.vcf.VCFCodec;
|
||||||
|
import org.broadinstitute.sting.commandline.Gather;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
|
@ -75,6 +76,7 @@ import java.util.Map;
|
||||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||||
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
|
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
|
||||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||||
|
@PartitionBy(PartitionType.LOCUS)
|
||||||
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
@ -90,6 +92,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
||||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||||
|
|
||||||
@Output
|
@Output
|
||||||
|
@Gather(CountCovariatesGatherer.class)
|
||||||
PrintStream out;
|
PrintStream out;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: carneiro
|
||||||
|
* Date: 3/29/11
|
||||||
|
* Time: 5:31 PM
|
||||||
|
*/
|
||||||
|
package oneoffs.carneiro;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||||
|
|
||||||
|
class QuickCCTest extends QScript {
|
||||||
|
qscript =>
|
||||||
|
|
||||||
|
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
|
||||||
|
var GATKjar: File = _
|
||||||
|
|
||||||
|
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
|
||||||
|
var input: File = _
|
||||||
|
|
||||||
|
@Input(doc="Reference fasta file", shortName="R", required=false)
|
||||||
|
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
|
||||||
|
|
||||||
|
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
|
||||||
|
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
|
||||||
|
|
||||||
|
@Input(shortName="L", required=false)
|
||||||
|
var intervals: List[String] = Nil
|
||||||
|
|
||||||
|
|
||||||
|
val queueLogDir: String = ".qlog/"
|
||||||
|
|
||||||
|
|
||||||
|
def script = {
|
||||||
|
|
||||||
|
val cc = new CountCovariates()
|
||||||
|
cc.reference_sequence = reference
|
||||||
|
cc.input_file :+= input
|
||||||
|
cc.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||||
|
cc.intervalsString = intervals
|
||||||
|
cc.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
|
||||||
|
cc.scatterCount = 4
|
||||||
|
cc.recal_file = new File("recal.csv")
|
||||||
|
|
||||||
|
add(cc);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue