+ * The format includes eleven standard fields, plus genotypes for each of the samples included
+ * in the file:
+ *
+ *
{
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
index f4048d37d..f4633b2ce 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@@ -38,10 +38,43 @@ import java.util.regex.Pattern;
import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
/**
- * A Tribble encoder / decoder for SAM pileup data.
+ * Decoder for SAM pileup data. For GATK validation purposes only
*
- * @author mhanna
- * @version 0.1
+ *
+ * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
+ * It describes the base-pair information at each chromosomal position. This format
+ * facilitates SNP/indel calling and brief alignment viewing by eyes.
+ *
+ *
+ * Each line consists of chromosome, 1-based coordinate, reference base, the
+ * number of reads covering the site, read bases and base qualities. At the
+ * read base column, a dot stands for a match to the reference base on the
+ * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
+ * on the forward strand and `acgtn' for a mismatch on the reverse strand.
+ * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
+ * this reference position and the next reference position. The length of the
+ * insertion is given by the integer in the pattern, followed by the inserted sequence.
+ *
+ *
+ *
+ *
See also: @see SAMTools project
+ *
See also: @see Pileup format
+ *
+ *
+ * File format example
+ *
+ * seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ * seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ * seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
+ * seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
+ * seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
+ * seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
+ * seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
+ * seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ *
+ *
+ * @author Matt Hanna
+ * @since 2009
*/
public class SAMPileupCodec implements FeatureCodec {
// the number of tokens we expect to parse from a pileup line
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
index f6861e585..d4bdb5aa9 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
@@ -36,8 +36,21 @@ import org.broad.tribble.util.ParsingUtils;
/**
* Decodes a simple SAM text string.
*
- * @author mhanna
- * @version 0.1
+ *
+ * Reads in the SAM text version of a BAM file as a ROD. For testing only
+ *
+ *
+ *
+ * See also: @see SAMTools for format specification
+ *
+ *
+ * File format example
+ *
+ * SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB
+ *
+ *
+ * @author Matt Hanna
+ * @since 2009
*/
public class SAMReadCodec implements FeatureCodec {
/* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
index b5efb49a7..7f3d9e17d 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
@@ -41,10 +41,11 @@ import java.io.IOException;
/**
* Codec for decoding the output format of the SnpEff variant effect predictor tool
- * (http://snpeff.sourceforge.net/).
*
+ *
* This format has 23 tab-delimited fields:
*
+ *
* Chromosome
* Position
* Reference
@@ -68,10 +69,16 @@ import java.io.IOException;
* Codons Around
* Amino Acids Around
* Custom Interval ID
+ *
+ * Note that we treat all except the Chromosome, Position, and Effect fields as optional.
+ *
*
- * We treat all except the Chromosome, Position, and Effect fields as optional.
+ *
+ * See also: @see SNPEff project page
+ *
*
* @author David Roazen
+ * @since 2011
*/
public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
index 6fe1907e3..fdcc8ed10 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
@@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import java.util.Arrays;
/**
- * Created by IntelliJ IDEA.
- * User: chartl
- * Date: 3/28/11
- * Time: 2:47 PM
- * To change this template use File | Settings | File Templates.
- */
-/**
- * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop)
+ * The standard table codec that expects loci as contig start stop, not contig:start-stop
+ *
+ *
+ * The standard table codec with a slightly different parsing convention
+ * (expects loci as contig start stop, not contig:start-stop)
+ *
+ *
+ *
+ * See also: TableCodec
+ *
+ *
+ * @author Chris Hartl
+ * @since 2010
*/
public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
index 2ce7c679e..1919ccbf0 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
@@ -11,13 +11,40 @@ import java.util.ArrayList;
import java.util.Arrays;
/**
- * implementation of a simple table (tab or comma delimited format) input files
+ * Reads tab-delimited tabular text files
+ *
+ *
+ *
+ * - Header: must begin with line HEADER or track (for IGV), followed by any number of column names,
+ * separated by whitespace.
+ * - Comment lines starting with # are ignored
+ * - Each non-header and non-comment line is split into parts by whitespace,
+ * and these parts are assigned as a map to their corresponding column name in the header.
+ * Note that the first element (corresponding to the HEADER column) must be a valid genome loc
+ * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec
+ * requires that there be one value for each column in the header, and no more, on all lines.
+ *
+ *
+ *
+ *
+ *
+ * File format example
+ *
+ * HEADER a b c
+ * 1:1 1 2 3
+ * 1:2 4 5 6
+ * 1:3 7 8 9
+ *
+ *
+ * @author Mark DePristo
+ * @since 2009
*/
public class TableCodec implements ReferenceDependentFeatureCodec {
- protected String delimiterRegex = "\\s+";
- protected String headerDelimiter = "HEADER";
- protected String igvHeaderDelimiter = "track";
- protected String commentDelimiter = "#";
+ final static protected String delimiterRegex = "\\s+";
+ final static protected String headerDelimiter = "HEADER";
+ final static protected String igvHeaderDelimiter = "track";
+ final static protected String commentDelimiter = "#";
+
protected ArrayList header = new ArrayList();
/**
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index 19f58ddaa..46242c302 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -14,10 +14,9 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
+import java.io.*;
import java.util.*;
+import java.util.zip.GZIPInputStream;
public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec {
@@ -623,9 +622,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
try {
- char[] buff = new char[MAGIC_HEADER_LINE.length()];
- new FileReader(potentialInput).read(buff, 0, MAGIC_HEADER_LINE.length());
+ return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
+ isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
+ } catch ( FileNotFoundException e ) {
+ return false;
+ } catch ( IOException e ) {
+ return false;
+ }
+ }
+
+ private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
+ try {
+ byte[] buff = new byte[MAGIC_HEADER_LINE.length()];
+ stream.read(buff, 0, MAGIC_HEADER_LINE.length());
String firstLine = new String(buff);
+ stream.close();
return firstLine.startsWith(MAGIC_HEADER_LINE);
} catch ( IOException e ) {
return false;
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java
index ea16595bb..e5b1a2de5 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java
@@ -14,8 +14,20 @@ import java.util.*;
/**
- * a feature codec for the VCF 3 specification. Our aim is to read in the records and convert to VariantContext as
- * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+ * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been
+ * deprecated in favor of VCF4 (See VCF codec for the latest information)
+ *
+ *
+ * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example)
+ *
+ *
+ *
+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication
+ *
+ *
+ * @author Mark DePristo
+ * @since 2010
*/
public class VCF3Codec extends AbstractVCFCodec {
public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3";
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
index 55a0eb3f9..fa030ef5f 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
@@ -12,12 +12,46 @@ import java.io.FileReader;
import java.io.IOException;
import java.util.*;
-
/**
- * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as
- * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+ * A feature codec for the VCF 4 specification
+ *
+ *
+ * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a
+ * header line, and then data lines each containing information about a position in the genome.
+ *
+ * One of the main uses of next-generation sequencing is to discover variation amongst large populations
+ * of related samples. Recently the format for storing next-generation read alignments has been
+ * standardised by the SAM/BAM file format specification. This has significantly improved the
+ * interoperability of next-generation tools for alignment, visualisation, and variant calling.
+ * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent
+ * types of sequence variation, including SNPs, indels and larger structural variants, together
+ * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for
+ * fast data retrieval of variants from a range of positions on the reference genome.
+ * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects
+ * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements
+ * various utilities for processing VCF files, including validation, merging and comparing,
+ * and also provides a general Perl and Python API.
+ * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.
+ *
+ *
+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication
+ *
+ *
+ * File format example
+ *
+ * ##fileformat=VCFv4.0
+ * #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
+ * chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99
+ * chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99
+ *
+ *
+ * @author Mark DePristo
+ * @since 2010
*/
public class VCFCodec extends AbstractVCFCodec {
+ // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
/**
From 0f25167efd3b765c8a40a8f6b90777e5a2eb4874 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Fri, 19 Aug 2011 11:01:04 -0400
Subject: [PATCH 02/11] minor fix in VariantEval docs
---
.../sting/gatk/walkers/varianteval/VariantEvalWalker.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index d1fa3f4df..f6d42afb1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -95,7 +95,7 @@ public class VariantEvalWalker extends RodWalker implements Tr
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
// Help arguments
- @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit")
+ @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;
// Partitioning the data arguments
From 4d1fd17a97aa49e48c6163b84ff3ff15725bea66 Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Fri, 19 Aug 2011 13:13:41 -0400
Subject: [PATCH 03/11] GATKDoclet cleanup and documentation
-- Fixed bug in the way ArgumentCollections were handled that led to failure in handling the dbsnp argument collection.
---
.../sting/utils/help/GATKDoclet.java | 26 +-
.../help/GenericDocumentationHandler.java | 263 +++++++++++-------
2 files changed, 189 insertions(+), 100 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
index 5755d2b37..de6ad359e 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java
@@ -34,7 +34,10 @@ import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
import java.util.*;
@@ -48,6 +51,7 @@ public class GATKDoclet {
final protected static Logger logger = Logger.getLogger(GATKDoclet.class);
protected static String buildTimestamp = null, absoluteVersion = null;
protected static boolean showHiddenFeatures = false;
+ protected static boolean testOnly = false;
RootDoc rootDoc;
@@ -75,6 +79,8 @@ public class GATKDoclet {
absoluteVersion = options[1];
if (options[0].equals("-include-hidden"))
showHiddenFeatures = true;
+ if (options[0].equals("-test"))
+ testOnly = true;
}
GATKDoclet doclet = new GATKDoclet();
@@ -88,16 +94,26 @@ public class GATKDoclet {
* @return Number of potential parameters; 0 if not supported.
*/
public static int optionLength(String option) {
- if(option.equals("-build-timestamp") || option.equals("-absolute-version") || option.equals("-include-hidden")) {
+ if(option.equals("-build-timestamp") ||
+ option.equals("-absolute-version") ||
+ option.equals("-include-hidden")) {
return 2;
- }
- return 0;
+ } else if ( option.equals("-test") )
+ return 1;
+ else
+ return 0;
}
public boolean showHiddenFeatures() {
return showHiddenFeatures;
}
+ public static boolean testOnly() {
+ return testOnly;
+ }
+
+ private static final List> testOnlyKeepers = Arrays.asList(
+ DocumentationTest.class, CommandLineGATK.class, UserException.class);
public Set workUnits() {
TreeSet m = new TreeSet();
@@ -105,6 +121,10 @@ public class GATKDoclet {
//logger.debug("Considering " + doc);
Class clazz = getClassForClassDoc(doc);
+ // in test mode, skip every class that is not listed in testOnlyKeepers
+ if ( clazz != null && testOnly && ! testOnlyKeepers.contains(clazz) )
+ continue;
+
//if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance"))
// logger.debug("foo");
diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
index d7add9af0..08e430c8a 100644
--- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
+++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java
@@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.help;
+import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import com.sun.javadoc.ClassDoc;
import com.sun.javadoc.FieldDoc;
@@ -31,8 +32,10 @@ import com.sun.javadoc.RootDoc;
import com.sun.javadoc.Tag;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
+import org.broad.tribble.bed.FullBEDFeature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
@@ -49,14 +52,18 @@ import java.util.*;
*/
public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class);
- GATKDocWorkUnit toProcess;
- ClassDoc classdoc;
- Set all;
- RootDoc rootDoc;
+
+ /** The Class we are documenting */
+ private GATKDocWorkUnit toProcess;
+
+ /** The set of all classes we are documenting, for cross-referencing */
+ private Set all;
+
+ /** The JavaDoc root */
+ private RootDoc rootDoc;
@Override
public boolean includeInDocs(ClassDoc doc) {
-// return true;
try {
Class type = HelpUtils.getClassForDoc(doc);
return JVMUtils.isConcrete(type);
@@ -76,7 +83,6 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
this.rootDoc = rootDoc;
this.toProcess = toProcessArg;
this.all = allArg;
- this.classdoc = toProcess.classDoc;
//System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc);
Map root = new HashMap();
@@ -88,71 +94,76 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
toProcess.setHandlerContent((String)root.get("summary"), root);
}
+ /**
+ * Add high-level summary information about toProcess to root, such as its
+ * name, summary, description, version, etc.
+ *
+ * @param root
+ */
protected void addHighLevelBindings(Map root) {
- root.put("name", classdoc.name());
+ root.put("name", toProcess.classDoc.name());
// Extract overrides from the doc tags.
StringBuilder summaryBuilder = new StringBuilder();
- for(Tag tag: classdoc.firstSentenceTags())
+ for(Tag tag: toProcess.classDoc.firstSentenceTags())
summaryBuilder.append(tag.text());
root.put("summary", summaryBuilder.toString());
- root.put("description", classdoc.commentText().substring(summaryBuilder.toString().length()));
+ root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length()));
root.put("timestamp", toProcess.buildTimestamp);
root.put("version", toProcess.absoluteVersion);
- for(Tag tag: classdoc.tags()) {
+ for(Tag tag: toProcess.classDoc.tags()) {
root.put(tag.name(), tag.text());
}
}
+ /**
+ * Add bindings describing related GATK capabilities to toProcess
+ * @param root
+ */
+ protected void addRelatedBindings(Map root) {
+ List