diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java new file mode 100755 index 000000000..dec88fa3b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 3/29/11 + * Time: 3:54 PM + * To change this template use File | Settings | File Templates. + */ + + +public class CountCovariatesGatherer extends Gatherer { + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + private static final String EOF_MARKER = "EOF"; + + private HashMap dataMap; + + + private void addCSVData (String line) { + String[] covariates = line.split(","); + String key = ""; + int [] values = new int[3]; + + for (int i = 0; i < covariates.length-3; i++) { + key += covariates[i] + ","; + } + + for (int i = covariates.length-3; i < covariates.length; i++) { + values[i] = Integer.parseInt(covariates[i].trim()); + } + + if (dataMap.get(key) != null) { + int [] currentValues = dataMap.get(key); + for (int i = 0; i < 2; i++) { + values[i] += currentValues[i];// todo -- update the third value using the CountCovariatesWalker function + } + } + + dataMap.put(key, values); + } + + @Override + public void gather(List inputs, File output) { + dataMap = new HashMap(); + PrintStream o; + try { + o = new PrintStream(output); + } catch ( FileNotFoundException e) { + throw new UserException("File to be output by CountCovariates Gather function was not found"); + } + + boolean sawEOF = false; + boolean headerPrinted = false; + + // Read input files + for ( File RECAL_FILE : inputs) { + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; // sanity check + } + else if( COMMENT_PATTERN.matcher(line).matches() || COVARIATE_PATTERN.matcher(line).matches() ) { + if (!headerPrinted) { + headerPrinted = true; + o.println(line); + } // Skip over the header (could check if headers are the same, but probably not necessary) + } + else { // Found a line of data + addCSVData(line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!"; + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + } + } + + // Write output file from dataMap + for(String key : dataMap.keySet()) { + int [] values = dataMap.get(key); + String v = "," + values[0] + "," + values[1] + "," + values[2]; + o.println(key + v); + } + + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 3667cf062..e21b16759 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -29,6 +29,7 @@ import org.broad.tribble.bed.BEDCodec; import org.broad.tribble.dbsnp.DbSNPCodec; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.commandline.Gather; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -75,6 +76,7 @@ import java.util.Map; @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file @ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta +@PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { ///////////////////////////// @@ -90,6 +92,7 @@ public class CountCovariatesWalker extends LocusWalker + + @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) + var GATKjar: File = _ + + @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + var input: File = _ + + @Input(doc="Reference fasta file", shortName="R", required=false) + var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + + @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) + var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + + @Input(shortName="L", required=false) + var intervals: List[String] = Nil + + + val queueLogDir: String = ".qlog/" + + + def script = { + + val cc = new CountCovariates() + cc.reference_sequence = reference + cc.input_file :+= input + cc.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + cc.intervalsString = intervals + cc.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + cc.scatterCount = 4 + cc.recal_file = new File("recal.csv") + + add(cc); + } +} \ No newline at end of file