diff --git a/build.xml b/build.xml index 068c69316..fe4c7a3f4 100644 --- a/build.xml +++ b/build.xml @@ -780,6 +780,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -814,6 +858,22 @@ + + + + + + + + + + + + + + + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 06d14366f..a812babaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -26,16 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.diffengine; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; -import java.util.Arrays; import java.util.Map; -import java.util.zip.GZIPInputStream; /** @@ -58,7 +54,13 @@ public class VCFDiffableReader implements DiffableReader { VCFCodec vcfCodec = new VCFCodec(); // must be read as state is stored in reader itself - vcfCodec.readHeader(lineReader); + VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + for ( VCFHeaderLine headerLine : header.getMetaData() ) { + String key = headerLine.getKey(); + if ( headerLine instanceof VCFNamedHeaderLine ) + key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); + root.add(key, headerLine.toString()); + } String line = lineReader.readLine(); int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 1d9616aac..fbe6e5b5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -242,7 +242,7 @@ public class ReadBackedPhasingWalker extends RodWalker KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList("PQ")); + private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); private VariantContext reduceVCToSamples(VariantContext vc, List samplesToPhase) { // for ( String sample : samplesToPhase ) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 01344a117..710127f7a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -7,6 +7,8 @@ import org.broad.tribble.NameAwareCodec; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -96,6 +98,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); + if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) + throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); + int arrayIndex = 0; for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { try { @@ -159,12 +164,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } private Feature reallyDecode(String line) { - try { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new IllegalStateException("VCF Header cannot be null when decoding a record"); + if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); if (parts == null) parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; @@ -174,17 +178,18 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data) if (( (header == null || (header != null && !header.hasGenotypingData())) && nParts != NUM_STANDARD_FIELDS) || (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) - throw new IllegalArgumentException("There aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + - " tokens, and saw " + nParts + " )"); + throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + + " tokens, and saw " + nParts + " )", lineNo); return parseVCFLine(parts); - } catch (TribbleException e) { - throw new TribbleException.InvalidDecodeLine(e.getMessage(), line); - } } protected void generateException(String message) { - throw new TribbleException.InvalidDecodeLine(message, lineNo); + throw new UserException.MalformedVCF(message, lineNo); + } + + private static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); } /** @@ -472,10 +477,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, return true; } - private static void generateException(String message, int lineNo) { - throw new TribbleException.InvalidDecodeLine(message, lineNo); - } - private static int computeForwardClipping(List unclippedAlleles, String ref) { boolean clipping = true; // Note that the computation of forward clipping here is meant only to see whether there is a common diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 0be4bec91..17c4a7df4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -154,6 +154,16 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedVCF extends UserException { + public MalformedVCF(String message, String line) { + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message, int lineNo) { + super(String.format("The provided VCF file is malformed at line nmber %d: %s", lineNo, message)); + } + } + public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index dacaf2738..d65f4ec34 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting; import org.apache.commons.lang.StringUtils; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; +import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; @@ -64,10 +66,19 @@ public class WalkerTest extends BaseTest { } System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); - Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()), "Index on disk from indexing on the fly not equal to the index created after the run completed"); + Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); + Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + if ( ! indexFromOutputFile.equals(dynamicIndex) ) { + Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", + indexFromOutputFile.getProperties(), + dynamicIndex.getProperties())); + } } } + + public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); for (int i = 0; i < resultFiles.size(); i++) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java index baa2f0383..a0cb47770 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -87,7 +87,7 @@ public class DiffableReaderUnitTest extends BaseTest { Assert.assertSame(diff.getParent(), DiffElement.ROOT); DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 9); + Assert.assertEquals(node.getElements().size(), 10); // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index d6caabd23..6a47d4b97 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -3,14 +3,15 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction - -import scala.io.Source._ -import collection.JavaConversions._ -import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ -import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel +import org.broadinstitute.sting.utils.baq.BAQ.CalculationMode + +import collection.JavaConversions._ +import net.sf.samtools.SAMFileReader import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.util.QScriptUtils class DataProcessingPipeline extends QScript { qscript => @@ -29,7 +30,8 @@ class DataProcessingPipeline extends QScript { @Input(doc="Reference fasta file", fullName="reference", shortName="R", required=true) var reference: File = _ - + @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true) + var dbSNP: File = _ /**************************************************************************** * Optional Parameters @@ -39,14 +41,12 @@ class DataProcessingPipeline extends QScript { // @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) // var sortSamJar: File = _ // - @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) - var bwaPath: File = _ - - @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) - var indels: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf") + var indels: File = _ + + @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) + var bwaPath: File = _ @Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) var projectName: String = "project" @@ -103,18 +103,6 @@ class DataProcessingPipeline extends QScript { val ds: String) {} - // Utility function to check if there are multiple samples in a BAM file (currently we can't deal with that) - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { - var sample: String = "" - for (r <- readGroups) { - if (sample.isEmpty) - sample = r.getSample - else if (sample != r.getSample) - return true; - } - return false - } - // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. // It uses the sample information on the header of the input BAM files. // @@ -135,7 +123,7 @@ class DataProcessingPipeline extends QScript { // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file // with PrintReads to separate the samples. Tell user to do it himself! - assert(!hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) + assert(!QScriptUtils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) // Fill out the sample table with the readgroups in this file for (rg <- readGroups) { @@ -166,12 +154,6 @@ class DataProcessingPipeline extends QScript { return sampleBamFiles.toMap } - // Checks how many contigs are in the dataset. Uses the BAM file header information. - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - // Rebuilds the Read Group string to give BWA def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) { val readGroups = samReader.getFileHeader.getReadGroups @@ -215,19 +197,6 @@ class DataProcessingPipeline extends QScript { return realignedBams } - // Reads a BAM LIST file and creates a scala list with all the files - def createListFromFile(in: File):List[File] = { - if (in.toString.endsWith("bam")) - return List(in) - var l: List[File] = List() - for (bam <- fromFile(in).getLines) { - if (!bam.startsWith("#") && !bam.isEmpty) - l :+= new File(bam.trim) - } - return l - } - - /**************************************************************************** * Main script @@ -237,8 +206,8 @@ class DataProcessingPipeline extends QScript { def script = { // keep a record of the number of contigs in the first bam file in the list - val bams = createListFromFile(input) - nContigs = getNumberOfContigs(bams(0)) + val bams = QScriptUtils.createListFromFile(input) + nContigs = QScriptUtils.getNumberOfContigs(bams(0)) val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} @@ -319,7 +288,8 @@ class DataProcessingPipeline extends QScript { this.out = outIntervals this.mismatchFraction = 0.0 this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.rodBind :+= RodBind("indels", "VCF", indels) + if (!indels.isEmpty) + this.rodBind :+= RodBind("indels", "VCF", indels) this.scatterCount = nContigs this.analysisName = queueLogDir + outIntervals + ".target" this.jobName = queueLogDir + outIntervals + ".target" @@ -330,7 +300,8 @@ class DataProcessingPipeline extends QScript { this.targetIntervals = tIntervals this.out = outBam this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.rodBind :+= RodBind("indels", "VCF", qscript.indels) + if (!indels.isEmpty) + this.rodBind :+= RodBind("indels", "VCF", indels) this.consensusDeterminationModel = consensusDeterminationModel this.compress = 0 this.scatterCount = nContigs @@ -353,7 +324,7 @@ class DataProcessingPipeline extends QScript { case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { this.input_file :+= inBam this.recal_file = inRecalFile - this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY + this.baq = CalculationMode.CALCULATE_AS_NECESSARY this.out = outBam if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) else if (qscript.intervals != null) this.intervals :+= qscript.intervals diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index dc9ae0f4b..fca420816 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -2,7 +2,7 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ -import net.sf.samtools.SAMFileReader +import org.broadinstitute.sting.queue.util.QScriptUtils /** * Created by IntelliJ IDEA. @@ -32,26 +32,25 @@ class RecalibrateBaseQualities extends QScript { val queueLogDir: String = ".qlog/" var nContigs: Int = 0 - def getNumberOfContigs(bamFile: File): Int = { - val samReader = new SAMFileReader(new File(bamFile)) - return samReader.getFileHeader.getSequenceDictionary.getSequences.size() - } - def script = { - nContigs = getNumberOfContigs(input) + val bamList = QScriptUtils.createListFromFile(input) + nContigs = QScriptUtils.getNumberOfContigs(bamList(0)) - val recalFile1: File = swapExt(input, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(input, ".bam", ".recal2.csv") - val recalBam: File = swapExt(input, ".bam", ".recal.bam") - val path1: String = input + "before" - val path2: String = input + "after" - - add(cov(input, recalFile1), - recal(input, recalFile1, recalBam), - cov(recalBam, recalFile2), - analyzeCovariates(recalFile1, path1), - analyzeCovariates(recalFile2, path2)) + for (bam <- bamList) { + + val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bam, ".bam", ".recal.bam") + val path1: String = bam + ".before" + val path2: String = bam + ".after" + + add(cov(bam, recalFile1), + recal(bam, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } } trait CommandLineGATKArgs extends CommandLineGATK { @@ -84,7 +83,7 @@ class RecalibrateBaseQualities extends QScript { case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { this.resources = R this.recal_file = inRecalFile - this.output_dir = outPath.toString + this.output_dir = outPath this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala new file mode 100644 index 000000000..9fb4fa30d --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import io.Source._ +import net.sf.samtools.{SAMReadGroupRecord, SAMFileReader} + +import collection.JavaConversions._ + + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/14/11 + * Time: 4:57 PM + * To change this template use File | Settings | File Templates. + */ + +object QScriptUtils { + + /** + * Takes a bam list file and produces a scala list with each file allowing the bam list + * to have empty lines and comment lines (lines starting with #). + */ + def createListFromFile(in: File):List[File] = { + // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // and return a list with only this file. + if (in.toString.endsWith(".bam")) + return List(in) + + var list: List[File] = List() + for (bam <- fromFile(in).getLines) + if (!bam.startsWith("#") && !bam.isEmpty ) + list :+= new File(bam.trim()) + list + } + + /** + * Returns the number of contigs in the BAM file header. + */ + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(bamFile) + samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + /** + * Check if there are multiple samples in a BAM file + */ + def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { + var sample: String = "" + for (r <- readGroups) { + if (sample.isEmpty) + sample = r.getSample + else if (sample != r.getSample) + return true; + } + false + } + + +} \ No newline at end of file diff --git a/settings/repository/org.broad/tribble-3.jar b/settings/repository/org.broad/tribble-4.jar similarity index 69% rename from settings/repository/org.broad/tribble-3.jar rename to settings/repository/org.broad/tribble-4.jar index f0ab44a05..1f82f3cc0 100644 Binary files a/settings/repository/org.broad/tribble-3.jar and b/settings/repository/org.broad/tribble-4.jar differ diff --git a/settings/repository/org.broad/tribble-3.xml b/settings/repository/org.broad/tribble-4.xml similarity index 58% rename from settings/repository/org.broad/tribble-3.xml rename to settings/repository/org.broad/tribble-4.xml index c35358331..07235efb0 100644 --- a/settings/repository/org.broad/tribble-3.xml +++ b/settings/repository/org.broad/tribble-4.xml @@ -1,4 +1,4 @@ -