diff --git a/build.xml b/build.xml index fe1723587..80627fae0 100644 --- a/build.xml +++ b/build.xml @@ -28,6 +28,7 @@ + @@ -44,11 +45,11 @@ - - + + - - + + @@ -69,8 +70,6 @@ - - @@ -103,7 +102,7 @@ - + @@ -128,14 +127,14 @@ - + - + - + @@ -146,11 +145,7 @@ - - - - - @@ -178,13 +173,23 @@ - - + + + + + - + + + + + + + + @@ -214,12 +219,6 @@ - - - - - - @@ -229,6 +228,10 @@ + + + + @@ -252,7 +255,7 @@ - + @@ -287,7 +290,7 @@ depends="gatk.compile.public.source,gatk.compile.private.source,gatk.compile.external.source" description="compile the GATK source" /> - + @@ -299,7 +302,16 @@ - + + + + + + + + + + @@ -312,7 +324,7 @@ + description="create GATK contracts" if="include.contracts" /> @@ -357,7 +369,6 @@ - @@ -374,7 +385,6 @@ - @@ -452,7 +462,7 @@ - + @@ -464,7 +474,7 @@ - + @@ -663,7 +673,7 @@ - + - - + @@ -820,7 +831,7 @@ - + @@ -828,7 +839,7 @@ - + @@ -921,8 +932,8 @@ - - + + @@ -944,7 +955,7 @@ - + @@ -969,7 +980,7 @@ - + diff --git a/ivy.xml b/ivy.xml index c2a6c4ccd..3f3d1c97f 100644 --- a/ivy.xml +++ b/ivy.xml @@ -48,6 +48,9 @@ + + + @@ -60,6 +63,10 @@ + + + + diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index 3cf43c15a..f8e298d88 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -61,7 +61,7 @@ public class AnalyzeCovariates extends CommandLineProgram { @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) private String PATH_TO_RSCRIPT = "Rscript"; @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - private String PATH_TO_RESOURCES = "R/"; + private String PATH_TO_RESOURCES = "public/R/"; @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false) private int IGNORE_QSCORES_LESS_THAN = 5; @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java old mode 100644 new mode 100755 diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java old mode 100644 new mode 100755 index 6ff0384a0..4b4ebe450 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java @@ -55,10 +55,14 @@ public class TableFeature implements Feature { } public List getAllValues() { - return getValuesTo(values.size()-1); + return getValuesTo(values.size()); } public List getValuesTo(int columnPosition) { return values.subList(0,columnPosition); } + + public List getHeader() { + return keys; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 143722d7c..ed10d2072 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -41,8 +42,8 @@ import java.util.*; public class ChromosomeCounts implements InfoFieldAnnotation, StandardAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; - private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, -1, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), - new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, -1, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), + private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), + new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") }; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 754d28dfd..ee66b50ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -142,5 +143,5 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot // public String getIndelBases() public List getKeyNames() { return Arrays.asList("AD"); } - public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFCompoundHeaderLine.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } + public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index f287549bb..a670532af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; @@ -200,8 +201,8 @@ public class ReadDepthAndAllelicFractionBySample implements GenotypeAnnotation { 1, VCFHeaderLineType.Integer, "Total read depth per sample, including MQ0"), - new VCFFormatHeaderLine(getKeyNames().get(1), - VCFCompoundHeaderLine.UNBOUNDED, + new VCFFormatHeaderLine(getKeyNames().get(1), + VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index 82f16be42..e2fd2a3d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -65,5 +66,5 @@ public class SampleList implements InfoFieldAnnotation { public List getKeyNames() { return Arrays.asList("Samples"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFInfoHeaderLine.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java new file mode 100644 index 000000000..f7a395d9d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import net.sf.samtools.*; +import net.sf.samtools.util.BlockCompressedInputStream; +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.zip.GZIPInputStream; + + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class BAMDiffableReader implements DiffableReader { + private final static int MAX_RECORDS_TO_READ = 1000; + @Override + public String getName() { return "BAM"; } + + @Override + public DiffElement readFromFile(File file) { + final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + + DiffNode root = DiffNode.rooted(file.getName()); + SAMRecordIterator iterator = reader.iterator(); + + int count = 0; + while ( iterator.hasNext() ) { + if ( count++ > MAX_RECORDS_TO_READ ) + break; + final SAMRecord record = iterator.next(); + + // name is the read name + first of pair + String name = record.getReadName().replace('.', '_'); + if ( record.getReadPairedFlag() ) { + name += record.getFirstOfPairFlag() ? "_1" : "_2"; + } + + DiffNode readRoot = DiffNode.empty(name, root); + + // add fields + readRoot.add("NAME", record.getReadName()); + readRoot.add("FLAGS", record.getFlags()); + readRoot.add("RNAME", record.getReferenceName()); + readRoot.add("POS", record.getAlignmentStart()); + readRoot.add("MAPQ", record.getMappingQuality()); + readRoot.add("CIGAR", record.getCigarString()); + readRoot.add("RNEXT", record.getMateReferenceName()); + readRoot.add("PNEXT", record.getMateAlignmentStart()); + readRoot.add("TLEN", record.getInferredInsertSize()); + readRoot.add("SEQ", record.getReadString()); + readRoot.add("QUAL", record.getBaseQualityString()); + + for ( SAMRecord.SAMTagAndValue xt : record.getAttributes() ) { + readRoot.add(xt.tag, xt.value); + } + + // add record to root + if ( ! root.hasElement(name) ) + // protect ourselves from malformed files + root.add(readRoot); + } + + reader.close(); + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + final byte[] BAM_MAGIC = "BAM\1".getBytes(); + final byte[] buffer = new byte[BAM_MAGIC.length]; + try { + FileInputStream fstream = new FileInputStream(file); + new BlockCompressedInputStream(fstream).read(buffer,0,BAM_MAGIC.length); + return Arrays.equals(buffer, BAM_MAGIC); + } catch ( IOException e ) { + return false; + } catch ( net.sf.samtools.FileTruncatedException e ) { + return false; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java new file mode 100644 index 000000000..eff24bb88 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffElement.java @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +@Invariant({ + "name != null", + "value != null", + "parent != null || name.equals(\"ROOT\")", + "value == null || value.getBinding() == this"}) +public class DiffElement { + public final static DiffElement ROOT = new DiffElement(); + + final private String name; + final private DiffElement parent; + final private DiffValue value; + + /** + * For ROOT only + */ + private DiffElement() { + this.name = "ROOT"; + this.parent = null; + this.value = new DiffValue(this, "ROOT"); + } + + @Requires({"name != null", "parent != null", "value != null"}) + public DiffElement(String name, DiffElement parent, DiffValue value) { + if ( name.equals("ROOT") ) throw new IllegalArgumentException("Cannot use reserved name ROOT"); + this.name = name; + this.parent = parent; + this.value = value; + this.value.setBinding(this); + } + + @Ensures({"result != null"}) + public String getName() { + return name; + } + + public DiffElement getParent() { + return parent; + } + + @Ensures({"result != null"}) + public DiffValue getValue() { + return value; + } + + public boolean isRoot() { return this == ROOT; } + + @Ensures({"result != null"}) + @Override + public String toString() { + return getName() + "=" + getValue().toString(); + } + + public String toString(int offset) { + return (offset > 0 ? Utils.dupString(' ', offset) : 0) + getName() + "=" + getValue().toString(offset); + } + + @Ensures({"result != null"}) + public final String fullyQualifiedName() { + if ( isRoot() ) + return ""; + else if ( parent.isRoot() ) + return name; + else + return parent.fullyQualifiedName() + "." + name; + } + + @Ensures({"result != null"}) + public String toOneLineString() { + return getName() + "=" + getValue().toOneLineString(); + } + + @Ensures({"result != null"}) + public DiffNode getValueAsNode() { + if ( getValue().isCompound() ) + return (DiffNode)getValue(); + else + throw new ReviewedStingException("Illegal request conversion of a DiffValue into a DiffNode: " + this); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java new file mode 100644 index 000000000..ba2713bff --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:51 PM + * A generic engine for comparing tree-structured objects + */ +public class DiffEngine { + final protected static Logger logger = Logger.getLogger(DiffEngine.class); + + private final Map readers = new HashMap(); + + public DiffEngine() { + loadDiffableReaders(); + } + + // -------------------------------------------------------------------------------- + // + // difference calculation + // + // -------------------------------------------------------------------------------- + + public List diff(DiffElement master, DiffElement test) { + DiffValue masterValue = master.getValue(); + DiffValue testValue = test.getValue(); + + if ( masterValue.isCompound() && masterValue.isCompound() ) { + return diff(master.getValueAsNode(), test.getValueAsNode()); + } else if ( masterValue.isAtomic() && testValue.isAtomic() ) { + return diff(masterValue, testValue); + } else { + // structural difference in types. one is node, other is leaf + return Arrays.asList(new Difference(master, test)); + } + } + + public List diff(DiffNode master, DiffNode test) { + Set allNames = new HashSet(master.getElementNames()); + allNames.addAll(test.getElementNames()); + List diffs = new ArrayList(); + + for ( String name : allNames ) { + DiffElement masterElt = master.getElement(name); + DiffElement testElt = test.getElement(name); + if ( masterElt == null && testElt == null ) { + throw new ReviewedStingException("BUG: unexceptedly got two null elements for field: " + name); + } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value + // todo -- should one of these be a special MISSING item? + diffs.add(new Difference(masterElt, testElt)); + } else { + diffs.addAll(diff(masterElt, testElt)); + } + } + + return diffs; + } + + public List diff(DiffValue master, DiffValue test) { + if ( master.getValue().equals(test.getValue()) ) { + return Collections.emptyList(); + } else { + return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); + } + } + + // -------------------------------------------------------------------------------- + // + // Summarizing differences + // + // -------------------------------------------------------------------------------- + + /** + * Emits a summary of the diffs to out. Suppose you have the following three differences: + * + * A.X.Z:1!=2 + * A.Y.Z:3!=4 + * B.X.Z:5!=6 + * + * The above is the itemized list of the differences. The summary looks for common differences + * in the name hierarchy, counts those shared elements, and emits the differences that occur + * in order of decreasing counts. + * + * So, in the above example, what are the shared elements? + * + * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2 + * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3 + * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1 + * + * So we would emit the following summary: + * + * *.*.Z: 3 + * *.X.Z: 2 + * A.X.Z: 1 [specific difference: 1!=2] + * A.Y.Z: 1 [specific difference: 3!=4] + * B.X.Z: 1 [specific difference: 5!=6] + * + * The algorithm to accomplish this calculation is relatively simple. Start with all of the + * concrete differences. For each pair of differences A1.A2....AN and B1.B2....BN: + * + * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si + * If i == 0, then there's no shared substructure + * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN + * if X is a known summary, increment it's count, otherwise set its count to 1 + * + * Not that only pairs of the same length are considered as potentially equivalent + * + * @param params determines how we display the items + * @param diffs + */ + public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { + printSummaryReport(summarizeDifferences(diffs), params ); + } + + public List summarizeDifferences(List diffs) { + List diffPaths = new ArrayList(diffs.size()); + + for ( Difference diff1 : diffs ) { + diffPaths.add(diffNameToPath(diff1.getFullyQualifiedName())); + } + + return summarizedDifferencesOfPaths(diffPaths); + } + + final protected static String[] diffNameToPath(String diffName) { + return diffName.split("\\."); + } + + protected List summarizedDifferencesOfPaths(List diffPaths) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( int i = 0; i < diffPaths.size(); i++ ) { + for ( int j = 0; j <= i; j++ ) { + String[] diffPath1 = diffPaths.get(i); + String[] diffPath2 = diffPaths.get(j); + if ( diffPath1.length == diffPath2.length ) { + int lcp = longestCommonPostfix(diffPath1, diffPath2); + String path = lcp > 0 ? summarizedPath(diffPath2, lcp) : Utils.join(".", diffPath2); + addSummary(summaries, path, true); + } + } + } + + // count differences + for ( String[] diffPath : diffPaths ) { + for ( SummarizedDifference sumDiff : summaries.values() ) { + if ( sumDiff.matches(diffPath) ) + addSummary(summaries, sumDiff.getPath(), false); + } + } + + List sortedSummaries = new ArrayList(summaries.values()); + Collections.sort(sortedSummaries); + return sortedSummaries; + } + + private static void addSummary(Map summaries, String path, boolean onlyCatalog) { + if ( summaries.containsKey(path) ) { + if ( ! onlyCatalog ) + summaries.get(path).incCount(); + } else { + SummarizedDifference sumDiff = new SummarizedDifference(path); + summaries.put(sumDiff.getPath(), sumDiff); + } + } + + protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { + GATKReport report = new GATKReport(); + final String tableName = "diffences"; + report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffObjectsWalker_and_SummarizedDifferences for more information"); + GATKReportTable table = report.getTable(tableName); + table.addPrimaryKey("Difference", true); + table.addColumn("NumberOfOccurrences", 0); + + int count = 0, count1 = 0; + for ( SummarizedDifference diff : sortedSummaries ) { + if ( diff.getCount() < params.minSumDiffToShow ) + // in order, so break as soon as the count is too low + break; + + if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay ) + break; + + if ( diff.getCount() == 1 ) { + count1++; + if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems ) + break; + } + + table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount()); + } + + table.write(params.out); + } + + protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { + int i = 0; + for ( ; i < diffPath1.length; i++ ) { + int j = diffPath1.length - i - 1; + if ( ! diffPath1[j].equals(diffPath2[j]) ) + break; + } + return i; + } + + /** + * parts is [A B C D] + * commonPostfixLength: how many parts are shared at the end, suppose its 2 + * We want to create a string *.*.C.D + * + * @param parts + * @param commonPostfixLength + * @return + */ + protected static String summarizedPath(String[] parts, int commonPostfixLength) { + int stop = parts.length - commonPostfixLength; + if ( stop > 0 ) parts = parts.clone(); + for ( int i = 0; i < stop; i++ ) { + parts[i] = "*"; + } + return Utils.join(".", parts); + } + + /** + * TODO -- all of the algorithms above should use SummarizedDifference instead + * TODO -- of some SummarizedDifferences and some low-level String[] + */ + public static class SummarizedDifference implements Comparable { + final String path; // X.Y.Z + final String[] parts; + int count = 0; + + public SummarizedDifference(String path) { + this.path = path; + this.parts = diffNameToPath(path); + } + + public void incCount() { count++; } + + public int getCount() { + return count; + } + + /** + * The fully qualified path object A.B.C etc + * @return + */ + public String getPath() { + return path; + } + + /** + * @return the length of the parts of this summary + */ + public int length() { + return this.parts.length; + } + + /** + * Returns true if the string parts matches this summary. Matches are + * must be equal() everywhere where this summary isn't *. + * @param otherParts + * @return + */ + public boolean matches(String[] otherParts) { + if ( otherParts.length != length() ) + return false; + + // TODO optimization: can start at right most non-star element + for ( int i = 0; i < length(); i++ ) { + String part = parts[i]; + if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) + return false; + } + + return true; + } + + @Override + public String toString() { + return String.format("%s:%d", getPath(), getCount()); + } + + @Override + public int compareTo(SummarizedDifference other) { + // sort first highest to lowest count, then by lowest to highest path + int countCmp = Integer.valueOf(count).compareTo(other.count); + return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); + } + + + } + + // -------------------------------------------------------------------------------- + // + // plugin manager + // + // -------------------------------------------------------------------------------- + + public void loadDiffableReaders() { + List> drClasses = new PluginManager( DiffableReader.class ).getPlugins(); + + logger.info("Loading diffable modules:"); + for (Class drClass : drClasses ) { + logger.info("\t" + drClass.getSimpleName()); + + try { + DiffableReader dr = drClass.newInstance(); + readers.put(dr.getName(), dr); + } catch (InstantiationException e) { + throw new ReviewedStingException("Unable to instantiate module '" + drClass.getSimpleName() + "'"); + } catch (IllegalAccessException e) { + throw new ReviewedStingException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'"); + } + } + } + + protected Map getReaders() { + return readers; + } + + protected DiffableReader getReader(String name) { + return readers.get(name); + } + + /** + * Returns a reader appropriate for this file, or null if no such reader exists + * @param file + * @return + */ + public DiffableReader findReaderForFile(File file) { + for ( DiffableReader reader : readers.values() ) + if (reader.canRead(file) ) + return reader; + + return null; + } + + /** + * Returns true if reader appropriate for this file, or false if no such reader exists + * @param file + * @return + */ + public boolean canRead(File file) { + return findReaderForFile(file) != null; + } + + public DiffElement createDiffableFromFile(File file) { + DiffableReader reader = findReaderForFile(file); + if ( reader == null ) + throw new UserException("Unsupported file type: " + file); + else + return reader.readFromFile(file); + } + + public static boolean simpleDiffFiles(File masterFile, File testFile, DiffEngine.SummaryReportParams params) { + DiffEngine diffEngine = new DiffEngine(); + + if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { + DiffElement master = diffEngine.createDiffableFromFile(masterFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile); + List diffs = diffEngine.diff(master, test); + diffEngine.reportSummarizedDifferences(diffs, params); + return true; + } else { + return false; + } + } + + public static class SummaryReportParams { + PrintStream out = System.out; + int maxItemsToDisplay = 0; + int maxCountOneItems = 0; + int minSumDiffToShow = 0; + + public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) { + this.out = out; + this.maxItemsToDisplay = maxItemsToDisplay; + this.maxCountOneItems = maxCountOneItems; + this.minSumDiffToShow = minSumDiffToShow; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java new file mode 100644 index 000000000..0720e18c0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffNode.java @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffNode extends DiffValue { + private Map getElementMap() { + return (Map)super.getValue(); + } + private static Map emptyElements() { return new HashMap(); } + + private DiffNode(Map elements) { + super(elements); + } + + private DiffNode(DiffElement binding, Map elements) { + super(binding, elements); + } + + // --------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------- + + public static DiffNode rooted(String name) { + return empty(name, DiffElement.ROOT); + } + + public static DiffNode empty(String name, DiffElement parent) { + DiffNode df = new DiffNode(emptyElements()); + DiffElement elt = new DiffElement(name, parent, df); + df.setBinding(elt); + return df; + } + + public static DiffNode empty(String name, DiffValue parent) { + return empty(name, parent.getBinding()); + } + + // --------------------------------------------------------------------------- + // + // accessors + // + // --------------------------------------------------------------------------- + + @Override + public boolean isAtomic() { return false; } + + public Collection getElementNames() { + return getElementMap().keySet(); + } + + public Collection getElements() { + return getElementMap().values(); + } + + private Collection getElements(boolean atomicOnly) { + List elts = new ArrayList(); + for ( DiffElement elt : getElements() ) + if ( (atomicOnly && elt.getValue().isAtomic()) || (! atomicOnly && elt.getValue().isCompound())) + elts.add(elt); + return elts; + } + + public Collection getAtomicElements() { + return getElements(true); + } + + public Collection getCompoundElements() { + return getElements(false); + } + + public DiffElement getElement(String name) { + for ( DiffElement elt : getElements() ) + if ( elt.getName().equals(name) ) + return elt; + return null; + } + + /** + * Returns true if name is bound in this node + * @param name + * @return + */ + public boolean hasElement(String name) { + return getElement(name) != null; + } + + // --------------------------------------------------------------------------- + // + // add + // + // --------------------------------------------------------------------------- + + @Requires("elt != null") + public void add(DiffElement elt) { + if ( getElementMap().containsKey(elt.getName()) ) + throw new IllegalArgumentException("Attempting to rebind already existing binding: " + elt + " node=" + this); + getElementMap().put(elt.getName(), elt); + } + + @Requires("elt != null") + public void add(DiffValue elt) { + add(elt.getBinding()); + } + + @Requires("elts != null") + public void add(Collection elts) { + for ( DiffElement e : elts ) + add(e); + } + + public void add(String name, Object value) { + add(new DiffElement(name, this.getBinding(), new DiffValue(value))); + } + + // --------------------------------------------------------------------------- + // + // toString + // + // --------------------------------------------------------------------------- + + @Override + public String toString() { + return toString(0); + } + + @Override + public String toString(int offset) { + String off = offset > 0 ? Utils.dupString(' ', offset) : ""; + StringBuilder b = new StringBuilder(); + + b.append("(").append("\n"); + Collection atomicElts = getAtomicElements(); + for ( DiffElement elt : atomicElts ) { + b.append(elt.toString(offset + 2)).append('\n'); + } + + for ( DiffElement elt : getCompoundElements() ) { + b.append(elt.toString(offset + 4)).append('\n'); + } + b.append(off).append(")").append("\n"); + + return b.toString(); + } + + @Override + public String toOneLineString() { + StringBuilder b = new StringBuilder(); + + b.append('('); + List parts = new ArrayList(); + for ( DiffElement elt : getElements() ) + parts.add(elt.toOneLineString()); + b.append(Utils.join(" ", parts)); + b.append(')'); + + return b.toString(); + } + + // -------------------------------------------------------------------------------- + // + // fromString and toOneLineString + // + // -------------------------------------------------------------------------------- + + public static DiffElement fromString(String tree) { + return fromString(tree, DiffElement.ROOT); + } + + /** + * Doesn't support full tree structure parsing + * @param tree + * @param parent + * @return + */ + private static DiffElement fromString(String tree, DiffElement parent) { + // X=(A=A B=B C=(D=D)) + String[] parts = tree.split("=", 2); + if ( parts.length != 2 ) + throw new ReviewedStingException("Unexpected tree structure: " + tree + " parts=" + parts); + String name = parts[0]; + String value = parts[1]; + + if ( value.length() == 0 ) + throw new ReviewedStingException("Illegal tree structure: " + value + " at " + tree); + + if ( value.charAt(0) == '(' ) { + if ( ! value.endsWith(")") ) + throw new ReviewedStingException("Illegal tree structure. Missing ): " + value + " at " + tree); + String subtree = value.substring(1, value.length()-1); + DiffNode rec = DiffNode.empty(name, parent); + String[] subParts = subtree.split(" "); + for ( String subPart : subParts ) { + rec.add(fromString(subPart, rec.getBinding())); + } + return rec.getBinding(); + } else { + return new DiffValue(name, parent, value).getBinding(); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java new file mode 100644 index 000000000..a08108db2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.apache.xmlbeans.impl.tool.Diff; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; + +import java.io.File; +import java.io.PrintStream; +import java.util.List; + +/** + * Compares two record-oriented files, itemizing specific difference between equivalent + * records in the two files. Reports both itemized and summarized differences. + * @author Mark DePristo + * @version 0.1 + */ +@Requires(value={}) +public class DiffObjectsWalker extends RodWalker { + @Output(doc="File to which results should be written",required=true) + protected PrintStream out; + + @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) + int MAX_RECORDS = 0; + + @Argument(fullName="maxCount1Records", shortName="M1", doc="Max. number of records occuring exactly once in the file to process", required=false) + int MAX_COUNT1_RECORDS = 0; + + @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) + int minCountForDiff = 1; + + @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) + boolean showItemizedDifferences = false; + + @Argument(fullName="master", shortName="m", doc="Master file: expected results", required=true) + File masterFile; + + @Argument(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) + File testFile; + + final DiffEngine diffEngine = new DiffEngine(); + + @Override + public void initialize() { + + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + @Override + public void onTraversalDone(Integer sum) { + out.printf("Reading master file %s%n", masterFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile); + out.printf("Reading test file %s%n", testFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile); + +// out.printf("Master diff objects%n"); +// out.println(master.toString()); +// out.printf("Test diff objects%n"); +// out.println(test.toString()); + + List diffs = diffEngine.diff(master, test); + if ( showItemizedDifferences ) { + out.printf("Itemized results%n"); + for ( Difference diff : diffs ) + out.printf("DIFF: %s%n", diff.toString()); + } + + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_RECORDS, MAX_COUNT1_RECORDS, minCountForDiff); + diffEngine.reportSummarizedDifferences(diffs, params); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java new file mode 100644 index 000000000..7245e9e8d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffValue.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.broadinstitute.sting.utils.Utils; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffValue { + private DiffElement binding = null; + final private Object value; + + public DiffValue(Object value) { + this.value = value; + } + + public DiffValue(DiffElement binding, Object value) { + this.binding = binding; + this.value = value; + } + + public DiffValue(DiffValue parent, Object value) { + this(parent.getBinding(), value); + } + + public DiffValue(String name, DiffElement parent, Object value) { + this.binding = new DiffElement(name, parent, this); + this.value = value; + } + + public DiffValue(String name, DiffValue parent, Object value) { + this(name, parent.getBinding(), value); + } + + public DiffElement getBinding() { + return binding; + } + + protected void setBinding(DiffElement binding) { + this.binding = binding; + } + + public Object getValue() { + return value; + } + + public String toString() { + return getValue().toString(); + } + + public String toString(int offset) { + return toString(); + } + + public String toOneLineString() { + return getValue().toString(); + } + + public boolean isAtomic() { return true; } + public boolean isCompound() { return ! isAtomic(); } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java new file mode 100644 index 000000000..84c2eed10 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReader.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.io.File; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Interface for readers creating diffable objects from a file + */ +public interface DiffableReader { + @Ensures("result != null") + public String getName(); + + @Ensures("result != null") + @Requires("file != null") + public DiffElement readFromFile(File file); + + @Requires("file != null") + public boolean canRead(File file); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java new file mode 100644 index 000000000..6627a4cc5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/Difference.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:53 PM + * + * Represents a specific difference between two specific DiffElements + */ +public class Difference { + DiffElement master, test; + + public Difference(DiffElement master, DiffElement test) { + if ( master == null && test == null ) throw new IllegalArgumentException("Master and test both cannot be null"); + this.master = master; + this.test = test; + } + + public String toString() { + return String.format("%s:%s!=%s", + getFullyQualifiedName(), + getOneLineString(master), + getOneLineString(test)); + } + + public String getFullyQualifiedName() { + return (master == null ? test : master).fullyQualifiedName(); + } + + private static String getOneLineString(DiffElement elt) { + return elt == null ? "MISSING" : elt.getValue().toOneLineString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java new file mode 100644 index 000000000..743178538 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.Arrays; +import java.util.Map; +import java.util.zip.GZIPInputStream; + + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class VCFDiffableReader implements DiffableReader { + @Override + public String getName() { return "VCF"; } + + @Override + public DiffElement readFromFile(File file) { + DiffNode root = DiffNode.rooted(file.getName()); + try { + LineReader lineReader = new AsciiLineReader(new FileInputStream(file)); + VCFCodec vcfCodec = new VCFCodec(); + VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); + + String line = lineReader.readLine(); + while ( line != null ) { + VariantContext vc = (VariantContext)vcfCodec.decode(line); + String name = vc.getChr() + ":" + vc.getStart(); + DiffNode vcRoot = DiffNode.empty(name, root); + + // add fields + vcRoot.add("CHROM", vc.getChr()); + vcRoot.add("POS", vc.getStart()); + vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("REF", vc.getReference()); + vcRoot.add("ALT", vc.getAlternateAlleles()); + vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("FILTER", vc.getFilters()); + + // add info fields + for (Map.Entry attribute : vc.getAttributes().entrySet()) { + if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY)) + vcRoot.add(attribute.getKey(), attribute.getValue()); + } + + for (Genotype g : vc.getGenotypes().values() ) { + DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); + gRoot.add("GT", g.getGenotypeString()); + gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 ); + + for (Map.Entry attribute : g.getAttributes().entrySet()) { + if ( ! attribute.getKey().startsWith("_") ) + gRoot.add(attribute.getKey(), attribute.getValue()); + } + + vcRoot.add(gRoot); + } + + root.add(vcRoot); + line = lineReader.readLine(); + } + + lineReader.close(); + } catch ( IOException e ) { + return null; + } + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + try { + final String VCF4_HEADER = "##fileformat=VCFv4"; + char[] buff = new char[VCF4_HEADER.length()]; + new FileReader(file).read(buff, 0, VCF4_HEADER.length()); + String firstLine = new String(buff); + return firstLine.startsWith(VCF4_HEADER); + } catch ( IOException e ) { + return false; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1045ca371..055eb0b97 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -92,25 +92,31 @@ public class UnifiedArgumentCollection { @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; + @Hidden @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + @Hidden @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) public double INDEL_GAP_OPEN_PENALTY = 45.0; + @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; + @Hidden @Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false) public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true; //gdebug+ - @Hidden // experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!! + @Hidden @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) public boolean GET_GAP_PENALTIES_FROM_DATA = false; + @Hidden @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); + @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @Hidden diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 7a765c602..fe0084a19 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -37,7 +37,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import java.util.*; import java.io.PrintStream; @@ -158,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker getSupportedHeaderStrings() { + Set result = new HashSet(); + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); + result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); + + return result; + } + /** * Compute at a given locus. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java index aebb0e3eb..df1f4f908 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java @@ -5,6 +5,7 @@ import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.*; @@ -113,9 +114,10 @@ public class ConstrainedMateFixingManager { HashMap forMateMatching = new HashMap(); TreeSet waitingReads = new TreeSet(comparer); - private T remove(TreeSet treeSet) { - final T first = treeSet.first(); - treeSet.remove(first); + private SAMRecord remove(TreeSet treeSet) { + final SAMRecord first = treeSet.first(); + if ( !treeSet.remove(first) ) + throw new UserException("Error caching SAM record " + first.getReadName() + ", which is usually caused by malformed SAM/BAM files in which multiple identical copies of a read are present."); return first; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index ff59c9e29..2cbc66e31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -43,9 +43,9 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; - if ( eval.hasAttribute("AC") ) + if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { AC = eval.getAttributeAsInt("AC"); - else if ( eval.isVariant() ) { + } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); } else diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 02d850211..d4e07dccf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index a09a30145..17461de2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import Jama.Matrix; @@ -72,7 +97,7 @@ public class GaussianMixtureModel { int ttt = 0; while( ttt++ < numIterations ) { - // Estep: assign each variant to the nearest cluster + // E step: assign each variant to the nearest cluster for( final VariantDatum datum : data ) { double minDistance = Double.MAX_VALUE; MultivariateGaussian minGaussian = null; @@ -87,7 +112,7 @@ public class GaussianMixtureModel { datum.assignment = minGaussian; } - // Mstep: update gaussian means based on assigned variants + // M step: update gaussian means based on assigned variants for( final MultivariateGaussian gaussian : gaussians ) { gaussian.zeroOutMu(); int numAssigned = 0; @@ -204,26 +229,29 @@ public class GaussianMixtureModel { } public double evaluateDatumMarginalized( final VariantDatum datum ) { - int numVals = 0; + int numSamples = 0; double sumPVarInGaussian = 0.0; - int numIter = 10; + final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + // for each dimension for( int iii = 0; iii < datum.annotations.length; iii++ ) { - // marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod + // if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod if( datum.isNull[iii] ) { - for( int ttt = 0; ttt < numIter; ttt++ ) { - datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0); + for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) { + datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution + // evaluate this random data point int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum ); } - sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); - numVals++; + // add this sample's probability to the pile in order to take an average in the end + sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) + numSamples++; } } } - return Math.log10( sumPVarInGaussian / ((double) numVals) ); + return Math.log10( sumPVarInGaussian / ((double) numSamples) ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java index 0b2edfd10..d077af78e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import Jama.Matrix; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java index 9bbcf395a..6c1a7ddbc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index fbee64fe2..64fe36637 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java index 08388db21..19c6d501b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index 2914385a4..5deb5d8c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -1,8 +1,32 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantQualityScore; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index f83e9b2f0..ddeda1699 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -1,6 +1,30 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; -import cern.jet.random.Normal; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -58,19 +82,11 @@ public class VariantDataManager { } foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") ) { // BUGBUG: to clean up - for( final VariantDatum datum : data ) { - if( datum.annotations[iii] > 0.0 ) { datum.annotations[iii] /= 3.0; } - } - } meanVector[iii] = theMean; varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { - datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD ); - // Each data point is now [ (x - mean) / standard deviation ] - if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) { - datum.annotations[iii] /= 3.0; - } + // Transform each data point via: (x - mean) / standard deviation + datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); } } if( foundZeroVarianceAnnotation ) { @@ -139,7 +155,7 @@ public class VariantDataManager { final int numBadSitesAdded = trainingData.size(); logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); - // Next, sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants + // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants Collections.sort( data ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { @@ -217,23 +233,15 @@ public class VariantDataManager { double value; try { - if( annotationKey.equalsIgnoreCase("QUAL") ) { - value = vc.getPhredScaledQual(); - } else if( annotationKey.equalsIgnoreCase("DP") ) { - value = Double.parseDouble( (String)vc.getAttribute( "DP" ) ) / Double.parseDouble( (String)vc.getAttribute( "AN" ) ); - } else { - value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); - if( Double.isInfinite(value) ) { value = Double.NaN; } - if( annotationKey.equalsIgnoreCase("InbreedingCoeff") && value > 0.05 ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - } - if( annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if( annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); + if( Double.isInfinite(value) ) { value = Double.NaN; } + if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM + value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { - value = Double.NaN; // The VQSR works with missing data now by marginalizing over the missing dimension when evaluating Gaussians + value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } return value; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index 04a5a9d3e..eb9e98fcb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.variantrecalibration; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index b903d20af..2d0355d7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -88,7 +88,7 @@ public class VariantRecalibrator extends RodWalker(Arrays.asList(USE_ANNOTATIONS)), VRAC ); if( IGNORE_INPUT_FILTERS != null ) { @@ -228,19 +229,23 @@ public class VariantRecalibrator extends RodWalker reduceSum ) { dataManager.setData( reduceSum ); dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation + + // Generate the positive model using the training data and evaluate each variant final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData() ); engine.evaluateData( dataManager.getData(), goodModel, false ); + + // Generate the negative model using the worst performing data and evaluate each variant contrastively final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) ); engine.evaluateData( dataManager.getData(), badModel, true ); engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); - final ExpandingArrayList randomData = dataManager.getRandomDataForPlotting( 6000 ); - + // Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user final int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY ); final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); final List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric ); tranchesStream.print(Tranche.tranchesString( tranches )); + // Find the filtering lodCutoff for display on the model PDFs. Red variants are those which were below the cutoff and filtered out of the final callset. double lodCutoff = 0.0; for( final Tranche tranche : tranches ) { if( MathUtils.compareDoubles(tranche.ts, TS_FILTER_LEVEL, 0.0001)==0 ) { @@ -252,7 +257,7 @@ public class VariantRecalibrator extends RodWalker newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles); + loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } @@ -504,7 +504,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param clippedAlleles output list of clipped alleles * @return a list of alleles, clipped to the reference */ - protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles) { + protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { // Note that the computation of forward clipping here is meant only to see whether there is a common // base to all alleles, and to correctly compute reverse clipping, @@ -522,6 +522,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, } if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) clipping = false; + else if (ref.length() == reverseClipped) + generateException("bad alleles encountered", lineNo); else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) clipping = false; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 31251c089..f4996b487 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -358,16 +358,8 @@ public class StandardVCFWriter implements VCFWriter { mWriter.write(key); if ( !entry.getValue().equals("") ) { - int numVals = 1; VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key); - if ( metaData != null ) - numVals = metaData.getCount(); - - // take care of unbounded encoding - if ( numVals == VCFInfoHeaderLine.UNBOUNDED ) - numVals = 1; - - if ( numVals > 0 ) { + if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) { mWriter.write("="); mWriter.write(entry.getValue()); } @@ -423,7 +415,7 @@ public class StandardVCFWriter implements VCFWriter { VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key); if ( metaData != null ) { - int numInFormatField = metaData.getCount(); + int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size()); if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. // For example, if Number=2, the string has to be ".,." diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java new file mode 100644 index 000000000..a9de949d8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +/** + * @author ebanks + * A class representing a key=value entry for ALT fields in the VCF header + */ +public class VCFAltHeaderLine extends VCFSimpleHeaderLine { + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + */ + public VCFAltHeaderLine(String name, String description) { + super(name, description, SupportedHeaderLineType.ALT); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + */ + protected VCFAltHeaderLine(String line, VCFHeaderVersion version) { + super(line, version, SupportedHeaderLineType.ALT); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index a799161ad..bb822f2ed 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -43,26 +45,43 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF // the field types private String name; - private int count; + private int count = -1; + private VCFHeaderLineCount countType; private String description; private VCFHeaderLineType type; // access methods public String getName() { return name; } - public int getCount() { return count; } public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } + public int getCount() { + if ( countType != VCFHeaderLineCount.INTEGER ) + throw new ReviewedStingException("Asking for header line count when type is not an integer"); + return count; + } - // - public void setNumberToUnbounded() { this.count = UNBOUNDED; } + // utility method + public int getCount(int numAltAlleles) { + int myCount; + switch ( countType ) { + case INTEGER: myCount = count; break; + case UNBOUNDED: myCount = -1; break; + case A: myCount = numAltAlleles; break; + case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break; + default: throw new ReviewedStingException("Unknown count type: " + countType); + } + return myCount; + } + + public void setNumberToUnbounded() { + countType = VCFHeaderLineCount.UNBOUNDED; + count = -1; + } // our type of line, i.e. format, info, etc private final SupportedHeaderLineType lineType; - // line numerical values are allowed to be unbounded (or unknown), which is - // marked with a dot (.) - public static final int UNBOUNDED = -1; // the value we store internally for unbounded types - /** * create a VCF format header line * @@ -70,10 +89,12 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF * @param count the count for this header line * @param type the type for this header line * @param description the description for this header line + * @param lineType the header line type */ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { super(lineType.toString(), ""); this.name = name; + this.countType = VCFHeaderLineCount.INTEGER; this.count = count; this.type = type; this.description = description; @@ -81,20 +102,53 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF validate(); } + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.name = name; + this.countType = count; + this.type = type; + this.description = description; + this.lineType = lineType; + validate(); + } + /** * create a VCF format header line * * @param line the header line * @param version the VCF header version + * @param lineType the header line type * */ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { super(lineType.toString(), ""); Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); name = mapping.get("ID"); - count = (version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) ? - mapping.get("Number").equals(VCFConstants.UNBOUNDED_ENCODING_v4) ? UNBOUNDED : Integer.valueOf(mapping.get("Number")) : - mapping.get("Number").equals(VCFConstants.UNBOUNDED_ENCODING_v3) ? UNBOUNDED : Integer.valueOf(mapping.get("Number")); + count = -1; + final String numberStr = mapping.get("Number"); + if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) { + countType = VCFHeaderLineCount.A; + } else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) { + countType = VCFHeaderLineCount.G; + } else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + ((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) && + numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) { + countType = VCFHeaderLineCount.UNBOUNDED; + } else { + countType = VCFHeaderLineCount.INTEGER; + count = Integer.valueOf(numberStr); + + } type = VCFHeaderLineType.valueOf(mapping.get("Type")); if (type == VCFHeaderLineType.Flag && !allowFlagValues()) throw new IllegalArgumentException("Flag is an unsupported type for this kind of field"); @@ -121,7 +175,15 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF protected String toStringEncoding() { Map map = new LinkedHashMap(); map.put("ID", name); - map.put("Number", count == UNBOUNDED ? VCFConstants.UNBOUNDED_ENCODING_v4 : count); + Object number; + switch ( countType ) { + case A: number = VCFConstants.PER_ALLELE_COUNT; break; + case G: number = VCFConstants.PER_GENOTYPE_COUNT; break; + case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break; + case INTEGER: + default: number = count; + } + map.put("Number", number); map.put("Type", type); map.put("Description", description); return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); @@ -136,15 +198,13 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF if ( !(o instanceof VCFCompoundHeaderLine) ) return false; VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o; - return name.equals(other.name) && - count == other.count && - description.equals(other.description) && - type == other.type && - lineType == other.lineType; + return equalsExcludingDescription(other) && + description.equals(other.description); } public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { return count == other.count && + countType == other.countType && type == other.type && lineType == other.lineType && name.equals(other.name); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 695c46c27..91cf86c70 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -99,6 +99,8 @@ public final class VCFConstants { public static final String MISSING_DEPTH_v3 = "-1"; public static final String UNBOUNDED_ENCODING_v4 = "."; public static final String UNBOUNDED_ENCODING_v3 = "-1"; + public static final String PER_ALLELE_COUNT = "A"; + public static final String PER_GENOTYPE_COUNT = "G"; public static final String EMPTY_ALLELE = "."; public static final String EMPTY_GENOTYPE = "./."; public static final double MAX_GENOTYPE_QUAL = 99.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java index 9176fc16e..418b80074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java @@ -1,19 +1,10 @@ package org.broadinstitute.sting.utils.codecs.vcf; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.Map; - - /** * @author ebanks * A class representing a key=value entry for FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { - - private String name; - private String description; - +public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { /** * create a VCF filter header line @@ -22,12 +13,7 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader * @param description the description for this header line */ public VCFFilterHeaderLine(String name, String description) { - super("FILTER", ""); - this.name = name; - this.description = description; - - if ( name == null || description == null ) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + super(name, description, SupportedHeaderLineType.FILTER); } /** @@ -37,34 +23,6 @@ public class VCFFilterHeaderLine extends VCFHeaderLine implements VCFNamedHeader * @param version the vcf header version */ protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super("FILTER", ""); - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); - name = mapping.get("ID"); - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - } - - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - map.put("Description", description); - return "FILTER=" + VCFHeaderLine.toStringEncoding(map); - } - - public boolean equals(Object o) { - if ( !(o instanceof VCFFilterHeaderLine) ) - return false; - VCFFilterHeaderLine other = (VCFFilterHeaderLine)o; - return name.equals(other.name) && - description.equals(other.description); - } - - public String getName() { - return name; - } - - public String getDescription() { - return description; + super(line, version, SupportedHeaderLineType.FILTER); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java index 352be3e97..474c8dd14 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFormatHeaderLine.java @@ -16,6 +16,10 @@ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { throw new IllegalArgumentException("Flag is an unsupported type for format fields"); } + public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.FORMAT); + } + protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { super(line, version, SupportedHeaderLineType.FORMAT); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java new file mode 100644 index 000000000..d615c7c78 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineCount.java @@ -0,0 +1,8 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +/** + * the count encodings we use for fields in VCF header lines + */ +public enum VCFHeaderLineCount { + INTEGER, A, G, UNBOUNDED; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java index 135a5c1a1..9b20f38a1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFInfoHeaderLine.java @@ -13,6 +13,10 @@ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { super(name, count, type, description, SupportedHeaderLineType.INFO); } + public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { + super(name, count, type, description, SupportedHeaderLineType.INFO); + } + protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) { super(line, version, SupportedHeaderLineType.INFO); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java new file mode 100644 index 000000000..152043f28 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.utils.codecs.vcf; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + * A class representing a key=value entry for simple VCF header types + */ +public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { + + public enum SupportedHeaderLineType { + FILTER, ALT; + } + + private String name; + private String description; + + // our type of line, i.e. filter, alt, etc + private final SupportedHeaderLineType lineType; + + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + this.name = name; + this.description = description; + + if ( name == null || description == null ) + throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + * @param lineType the header line type + */ + protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { + super(lineType.toString(), ""); + this.lineType = lineType; + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); + name = mapping.get("ID"); + description = mapping.get("Description"); + if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided + description = UNBOUND_DESCRIPTION; + } + + protected String toStringEncoding() { + Map map = new LinkedHashMap(); + map.put("ID", name); + map.put("Description", description); + return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFSimpleHeaderLine) ) + return false; + VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; + return name.equals(other.name) && + description.equals(other.description); + } + + public String getName() { + return name; + } + + public String getDescription() { + return description; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index ecede068e..4037f75b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -180,19 +180,4 @@ public class VCFUtils { return new HashSet(map.values()); } - - /** - * return a set of supported format lines; what we currently support for output in the genotype fields of a VCF - * @return a set of VCF format lines - */ - public static Set getSupportedHeaderStrings() { - Set result = new HashSet(); - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); - - return result; - } - } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index cb6557408..31791e805 100755 --- a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -133,8 +133,12 @@ public class Haplotype { byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); + int startAfter = startIdxInReference+numPrefBases+ refAllele.getBases().length; + // protect against long events that overrun available reference context + if (startAfter > refBases.length) + startAfter = refBases.length; byte[] basesAfterVariant = Arrays.copyOfRange(refBases, - startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); + startAfter, refBases.length); // Create location for all haplotypes diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index a9ba46159..901de6fae 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -108,7 +108,7 @@ public class Allele implements Comparable { this.bases = bases; if ( ! acceptableAlleleBases(bases) ) - throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); + throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } private Allele(String bases, boolean isRef) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 3d375aba2..5787b591f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1343,6 +1343,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return (int)stop; } + private boolean hasSymbolicAlleles() { + for (Allele a: getAlleles()) { + if (a.isSymbolic()) { + return true; + } + } + return false; + } + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, byte inputRefBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { Allele refAllele = inputVC.getReference(); @@ -1352,7 +1361,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; - if (refAllele.length() == locLength) + if (inputVC.hasSymbolicAlleles()) + padVC = true; + else if (refAllele.length() == locLength) padVC = false; else if (refAllele.length() == locLength-1) padVC = true; diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 61bb8b34b..b3e422ba9 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -4,6 +4,7 @@ import org.apache.commons.io.FileUtils; import org.apache.log4j.*; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; @@ -12,6 +13,10 @@ import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * @@ -107,6 +112,57 @@ public abstract class BaseTest { } } + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. Return from it the call to getTests, providing + * the class type of your test + * + * @DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * This class magically tracks created objects of this + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap>(); + + /** + * Create a new TestDataProvider instance bound to the class variable C + * @param c + */ + public TestDataProvider(Class c) { + if ( ! tests.containsKey(c) ) + tests.put(c, new ArrayList()); + tests.get(c).add(this); + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + } + /** * test if the file exists * @@ -279,11 +335,14 @@ public abstract class BaseTest { if (parameterize || expectedMD5.equals("")) { // Don't assert - } else { - Assert.assertEquals(filemd5sum, expectedMD5, name + " Mismatching MD5s"); + } else if ( filemd5sum.equals(expectedMD5) ) { System.out.println(String.format(" => %s PASSED", name)); + } else { + Assert.fail(String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum)); } + + return filemd5sum; } @@ -326,7 +385,12 @@ public abstract class BaseTest { System.out.printf("##### Path to calculated file (MD5=%s): %s%n", filemd5sum, pathToFileMD5File); System.out.printf("##### Diff command: diff %s %s%n", pathToExpectedMD5File, pathToFileMD5File); - // todo -- add support for simple inline display of the first N differences for text file + // inline differences + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(System.out, 20, 10, 0); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), params); + if ( success ) + System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", + pathToExpectedMD5File, pathToFileMD5File); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 33a20f7b5..600718aa0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -34,76 +34,76 @@ import java.util.Arrays; * Tests CombineVariants */ public class CombineVariantsIntegrationTest extends WalkerTest { -// public static String baseTestString(String args) { -// return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; -// } -// -// public void test1InOut(String file, String md5, boolean vcf3) { -// test1InOut(file, md5, "", vcf3); -// } -// -// public void test1InOut(String file, String md5, String args, boolean vcf3) { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -priority v1 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), -// 1, -// Arrays.asList(md5)); -// executeTest("testInOut1--" + file, spec); -// } -// -// public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -priority v1,v2 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -B:v2,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file2 + args), -// 1, -// Arrays.asList(md5)); -// executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); -// } -// -// public void combineSites(String args, String md5) { -// String file1 = "1000G_omni2.5.b37.sites.vcf"; -// String file2 = "hapmap_3.3.b37.sites.vcf"; -// WalkerTestSpec spec = new WalkerTestSpec( -// "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference -// + " -L 1:1-10,000,000 -B:omni,VCF " + validationDataLocation + file1 -// + " -B:hm3,VCF " + validationDataLocation + file2 + args, -// 1, -// Arrays.asList(md5)); -// executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); -// } -// -// -// @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2117fff6e0d182cd20be508e9661829c", true); } -// @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2cfaf7af3dd119df08b8a9c1f72e2f93", " -setKey foo", true); } -// @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1474ac0fde2ce42a3c24f1c97eab333e", " -setKey null", true); } -// @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "7fc66df048a0ab08cf507906e1d4a308", false); } // official project VCF files in tabix format -// -// @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ec9715f53dbf4531570557c212822f12", false); } -// @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f1072be5f5c6ee810276d9ca6537224d", false); } -// -// @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "b77a1eec725201d9d8e74ee0c45638d3", false); } // official project VCF files in tabix format -// @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "802977fdfd2f4905b501bb06800f60af", false); } // official project VCF files in tabix format -// @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a67157287dd2b24b5cdf7ebf8fcbbe9a", false); } -// -// @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e1f4718a179f1196538a33863da04f53", false); } -// -// @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "b3783384b7c8e877b971033e90beba48", true); } -// -// @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "902e541c87caa72134db6293fc46f0ad"); } -// @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "f339ad4bb5863b58b9c919ce7d040bb9"); } -// -// @Test public void threeWayWithRefs() { -// WalkerTestSpec spec = new WalkerTestSpec( -// baseTestString(" -B:NA19240_BGI,VCF "+validationDataLocation+"NA19240.BGI.RG.vcf" + -// " -B:NA19240_ILLUMINA,VCF "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + -// " -B:NA19240_WUGSC,VCF "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + -// " -B:denovoInfo,VCF "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + -// " -setKey centerSet" + -// " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + -// " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + -// " -genotypeMergeOptions UNIQUIFY -L 1"), -// 1, -// Arrays.asList("a07995587b855f3214fb71940bf23c0f")); -// executeTest("threeWayWithRefs", spec); -// } + public static String baseTestString(String args) { + return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; + } + + public void test1InOut(String file, String md5, boolean vcf3) { + test1InOut(file, md5, "", vcf3); + } + + public void test1InOut(String file, String md5, String args, boolean vcf3) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), + 1, + Arrays.asList(md5)); + executeTest("testInOut1--" + file, spec); + } + + public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -priority v1,v2 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -B:v2,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file2 + args), + 1, + Arrays.asList(md5)); + executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); + } + + public void combineSites(String args, String md5) { + String file1 = "1000G_omni2.5.b37.sites.vcf"; + String file2 = "hapmap_3.3.b37.sites.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference + + " -L 1:1-10,000,000 -B:omni,VCF " + validationDataLocation + file1 + + " -B:hm3,VCF " + validationDataLocation + file2 + args, + 1, + Arrays.asList(md5)); + executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); + } + + + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2117fff6e0d182cd20be508e9661829c", true); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "2cfaf7af3dd119df08b8a9c1f72e2f93", " -setKey foo", true); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1474ac0fde2ce42a3c24f1c97eab333e", " -setKey null", true); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "7fc66df048a0ab08cf507906e1d4a308", false); } // official project VCF files in tabix format + + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ec9715f53dbf4531570557c212822f12", false); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f1072be5f5c6ee810276d9ca6537224d", false); } + + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "b77a1eec725201d9d8e74ee0c45638d3", false); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "802977fdfd2f4905b501bb06800f60af", false); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a67157287dd2b24b5cdf7ebf8fcbbe9a", false); } + + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e1f4718a179f1196538a33863da04f53", false); } + + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "b3783384b7c8e877b971033e90beba48", true); } + + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "902e541c87caa72134db6293fc46f0ad"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "f339ad4bb5863b58b9c919ce7d040bb9"); } + + @Test public void threeWayWithRefs() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -B:NA19240_BGI,VCF "+validationDataLocation+"NA19240.BGI.RG.vcf" + + " -B:NA19240_ILLUMINA,VCF "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + + " -B:NA19240_WUGSC,VCF "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + + " -B:denovoInfo,VCF "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + + " -setKey centerSet" + + " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + + " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + + " -genotypeMergeOptions UNIQUIFY -L 1"), + 1, + Arrays.asList("a07995587b855f3214fb71940bf23c0f")); + executeTest("threeWayWithRefs", spec); + } // complex examples with filtering, indels, and multiple alleles @@ -119,7 +119,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void complexTestFull() { combineComplexSites("", "64b991fd3850f83614518f7d71f0532f"); } +// @Test public void complexTestFull() { combineComplexSites("", "64b991fd3850f83614518f7d71f0532f"); } @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "0db9ef50fe54b60426474273d7c7fa99"); } @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "d20acb3d53ba0a02ce92d540ebeda2a9"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "8d1b3d120515f8b56b5a0d10bc5da713"); } diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml index 1862d6cbb..7e31934df 100644 --- a/public/packages/AnalyzeCovariates.xml +++ b/public/packages/AnalyzeCovariates.xml @@ -10,7 +10,6 @@ - diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index d71c6ae5d..a961beca1 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -19,10 +19,7 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport class MethodsDevelopmentCallingPipeline extends QScript { qscript => - @Argument(shortName="gatk", doc="gatk jar file", required=true) - var gatkJarFile: File = _ - - @Argument(shortName="outputDir", doc="output directory", required=true) + @Argument(shortName="outputDir", doc="output directory", required=false) var outputDir: String = "./" @Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false) @@ -185,7 +182,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; - jarFile = gatkJarFile; memoryLimit = 4; phone_home = if ( LOCAL_ET ) GATKRunReport.PhoneHomeOption.STANDARD else GATKRunReport.PhoneHomeOption.AWS_S3 } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index bfcc4d48c..8ed3f84c1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -138,30 +138,32 @@ class QGraph extends Logging { validate() if (running && numMissingValues == 0) { - logger.info("Generating scatter gather jobs.") val scatterGathers = jobGraph.edgeSet.filter(edge => scatterGatherable(edge)) + if (!scatterGathers.isEmpty) { + logger.info("Generating scatter gather jobs.") - var addedFunctions = List.empty[QFunction] - for (scatterGather <- scatterGathers) { - val functions = scatterGather.asInstanceOf[FunctionEdge] - .function.asInstanceOf[ScatterGatherableFunction] - .generateFunctions() - addedFunctions ++= functions + var addedFunctions = List.empty[QFunction] + for (scatterGather <- scatterGathers) { + val functions = scatterGather.asInstanceOf[FunctionEdge] + .function.asInstanceOf[ScatterGatherableFunction] + .generateFunctions() + addedFunctions ++= functions + } + + logger.info("Removing original jobs.") + this.jobGraph.removeAllEdges(scatterGathers) + prune() + + logger.info("Adding scatter gather jobs.") + addedFunctions.foreach(function => if (running) this.add(function)) + + logger.info("Regenerating graph.") + fill + val scatterGatherDotFile = if (settings.expandedDotFile != null) settings.expandedDotFile else settings.dotFile + if (scatterGatherDotFile != null) + renderToDot(scatterGatherDotFile) + validate() } - - logger.info("Removing original jobs.") - this.jobGraph.removeAllEdges(scatterGathers) - prune() - - logger.info("Adding scatter gather jobs.") - addedFunctions.foreach(function => if (running) this.add(function)) - - logger.info("Regenerating graph.") - fill - val scatterGatherDotFile = if (settings.expandedDotFile != null) settings.expandedDotFile else settings.dotFile - if (scatterGatherDotFile != null) - renderToDot(scatterGatherDotFile) - validate() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 57d133dfe..ac2f036b4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -286,11 +286,11 @@ object Lsf706JobRunner extends Logging { // LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct) // LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist) logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(runner.jobId)) - val unknownStatusSeconds = (System.currentTimeMillis - runner.lastStatusUpdate) - if (unknownStatusSeconds > (unknownStatusMaxSeconds * 1000L)) { + val unknownStatusMillis = (System.currentTimeMillis - runner.lastStatusUpdate) + if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) { // Unknown status has been returned for a while now. runner.updateStatus(RunnerStatus.FAILED) - logger.error("Unable to read LSF status for %d minutes: job id %d: %s".format(unknownStatusSeconds/60, runner.jobId, runner.function.description)) + logger.error("Unable to read LSF status for %0.2f minutes: job id %d: %s".format(unknownStatusMillis/(60 * 1000D), runner.jobId, runner.function.description)) } } diff --git a/public/testdata/diffTestMaster.vcf b/public/testdata/diffTestMaster.vcf new file mode 100644 index 000000000..549f54345 --- /dev/null +++ b/public/testdata/diffTestMaster.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,115:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.75,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:99 diff --git a/public/testdata/diffTestTest.vcf b/public/testdata/diffTestTest.vcf new file mode 100644 index 000000000..8699ab253 --- /dev/null +++ b/public/testdata/diffTestTest.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.0 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 +chr1 2979 rs62635286 T G 83.67 CHANGED_FILTER AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,32:9:-33.61,-2.71,-0.00:27.09 +chr1 2981 rs62028691 A G 14.69 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:31,33:9:-32.12,-2.71,-0.00:27.08 +chr1 4536 rs11582131 G C 0.18 PASS AC=2;AF=0.50;AN=2 GT:AD:DP:GL:GQ 0/1:42,33:16:-41.67,-4.82,-26.29:99 +chr1 4562 rs11490464 C G 0.14 PASS AC=1;AF=0.50;AN=2 GT:AD:DP:GL:GQ 1/1:26,30:9:-19.64,-2.72,-14.87:99 +chr1 4770 rs6682375 A G 0.32 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 0/1:9,111:84:-306.27,-28.58,-3.46:99 +chr1 4793 rs6682385 A G 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:4,114:109:-350.74,-32.88,-0.10:99 +chr1 5074 rs11586607 T G 0.01 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:29,97:39:-130.41,-11.74,-3.82:79.31 +chr1 5137 rs62636497 A T 140.49 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:0,74:39:-148.99,-11.75,-0.01:9 diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml index 1e47fa847..2b4a081d4 100644 --- a/settings/ivysettings.xml +++ b/settings/ivysettings.xml @@ -25,5 +25,7 @@ + + diff --git a/settings/repository/com.google/cofoja-1.0-20110609.jar b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar similarity index 100% rename from settings/repository/com.google/cofoja-1.0-20110609.jar rename to settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.jar diff --git a/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml new file mode 100644 index 000000000..38d4e88f1 --- /dev/null +++ b/settings/repository/com.google.code.cofoja/cofoja-1.0-20110609.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar new file mode 100644 index 000000000..dc77c7d33 Binary files /dev/null and b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2-sources.jar differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar new file mode 100644 index 000000000..f267be4b5 Binary files /dev/null and b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.jar differ diff --git a/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml new file mode 100644 index 000000000..c6a8da052 --- /dev/null +++ b/settings/repository/net.sf.gridscheduler/drmaa-6.2u5p2.xml @@ -0,0 +1,3 @@ + + +