GATKDoc descriptions for all standard codecs, or TODO for their owners

-- Also added vcf.gz support in the VCF codec. This wasn't committed in the last round, because it was missed by the parallel documentation effort.
2011-08-19 09:57:21 -04:00 · 2011-08-19 09:57:21 -04:00 · 198955f752
parent bc902e8421
commit 198955f752
12 changed files with 249 additions and 37 deletions
--- a/build.xml
+++ b/build.xml
@ -49,7 +49,7 @@
    
    <!-- Contracts for Java -->
    <!-- To disable, run with -Duse.contracts=false -->
-    <property name="use.contracts" value="true" />
+    <property name="use.contracts" value="false" />
    <property name="java.contracts" value="${build.dir}/java/contracts" />
    <property name="contracts.version" value="1.0-20110609" />
    <property name="cofoja.jar" value="${lib.dir}/cofoja-${contracts.version}.jar"/>
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java
@ -40,6 +40,29 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.regex.Pattern;

+/**
+ * TODO GUILLERMO DEL ANGEL
+ *
+ * <p>
+ * Codec Description
+ * </p>
+ *
+ * <p>
+ * See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
+ * </p>
+ *
+ *
+ * <h2>File format example</h2>
+ * <pre>
+ *     line 1
+ *     line 2
+ *     line 3
+ * </pre>
+ *
+ * @author Mark DePristo
+ * @since 2010
+ */
 public class BeagleCodec implements ReferenceDependentFeatureCodec<BeagleFeature> {
    private String[] header;
    public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2};
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java
@ -33,12 +33,43 @@ import java.io.IOException;
 import java.util.Arrays;

 /**
- * a codec for the file types produced by the HapMap consortium, available on their website:
- * http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/
+ * A codec for the file types produced by the HapMap consortium
 *
- * The format includes eleven standard fields, plus genotypes for each of the samples included
- * in the file
- * 
+ * <p>
+ *     The format includes eleven standard fields, plus genotypes for each of the samples included
+ *     in the file:
+ *
+ * <pre>
+ *     Col1: refSNP rs# identifier at the time of release (NB might merge with another rs# in the future)
+ *     Col2: SNP alleles according to dbSNP
+ *     Col3: chromosome that SNP maps to
+ *     Col4: chromosome position of SNP, in basepairs on reference sequence
+ *     Col5: strand of reference sequence that SNP maps to
+ *     Col6: version of reference sequence assembly
+ *     Col7: HapMap genotype center that produced the genotypes
+ *     Col8: LSID for HapMap protocol used for genotyping
+ *     Col9: LSID for HapMap assay used for genotyping
+ *     Col10: LSID for panel of individuals genotyped
+ *     Col11: QC-code, currently 'QC+' for all entries (for future use)
+ *     Col12 and on: observed genotypes of samples, one per column, sample identifiers in column headers (Coriell catalog numbers, example: NA10847). Duplicate samples have .dup suffix.
+ * </pre>
+ * </p>
+ *
+ * <p>
+ *  See also: @see <a href="http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/">HapMap genotypes download</a>
+ * </p>
+ *
+ * <h2>File format example</h2>
+ * From <a href="http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest/forward/non-redundant/genotypes_chr1_ASW_r27_nr.b36_fwd.txt.gz">genotypes_chr1_ASW_r27_nr.b36_fwd.txt.gz</a>:
+ * <pre>
+ *     rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA19625 NA19700 NA19701 NA19702 NA19703 NA19704 NA19705 NA19708 NA19712 NA19711 NA19818 NA19819 NA19828 NA19835 NA19834 NA19836 NA19902 NA19901 NA19900 NA19904 NA19919 NA19908 NA19909 NA19914 NA19915 NA19916 NA19917 NA19918 NA19921 NA20129 NA19713 NA19982 NA19983 NA19714 NA19985 NA20128 NA20126 NA20127 NA20277 NA20276 NA20279 NA20282 NA20281 NA20284 NA20287 NA20288 NA20290 NA20289 NA20291 NA20292 NA20295 NA20294 NA20297 NA20300 NA20301 NA20302 NA20317 NA20319 NA20322 NA20333 NA20332 NA20335 NA20334 NA20337 NA20336 NA20340 NA20341 NA20343 NA20342 NA20344 NA20345 NA20346 NA20347 NA20348 NA20349 NA20350 NA20357 NA20356 NA20358 NA20359 NA20360 NA20363 NA20364
+ *     rs9629043 C/T chr1 554636 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8575115:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ CC CC CC CC CC CC CC CC CC CC CC CC NN CC CC CC CT CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC
+ *     rs28446478 G/T chr1 576058 + ncbi_b36 sanger urn:LSID:illumina.hapmap.org:Protocol:Human_1M_BeadChip:3 urn:LSID:sanger.hapmap.org:Assay:H1Mrs28446478:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GT TT GT TT TT TT TT GT GT TT TT TT TT GT GT GT GT TT GT TT GT GT TT GT GT TT TT TT GT GT TT TT TT GT TT GT TT GT GT GT GT GT TT GT TT TT GT GT TT TT TT TT TT TT GT GT GT GT TT TT TT TT GT TT GT TT TT GT TT TT TT GT TT TT TT GT GT TT GT TT GT TT TT
+ *     rs12565286 C/G chr1 711153 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8709646:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG GG CG NN GG GG GG GG GG GG NN GG NN NN
+ * </pre>
+ *
+ * @author Mark DePristo
+ * @since 2010
 */
 public class RawHapMapCodec implements FeatureCodec {
    // the minimum number of features in the HapMap file line
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@ -12,7 +12,23 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;

 /**
- * the ref seq codec
+ * TODO FOR CHRIS HARTL
+ *
+ * <p>
+ * Codec Description
+ * </p>
+ *
+ * <p>
+ * See also: link to file specification
+ * </p>
+ *
+ * <h2>File format example</h2>
+ * <p>
+ *     A BAM file containing <b>exactly one sample</b>.
+ * </p>
+ *
+ * @author Mark DePristo
+ * @since 2010
 */
 public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature> {

--- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java
@ -38,10 +38,43 @@ import java.util.regex.Pattern;
 import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;

 /**
- * A Tribble encoder / decoder for SAM pileup data. 
+ * Decoder for SAM pileup data.  For GATK validation purposes only
 *
- * @author mhanna
- * @version 0.1
+ * <p>
+ *     Pileup format was first used by Tony Cox and Zemin Ning at the Sanger Institute.
+ *     It describes the base-pair information at each chromosomal position. This format
+ *     facilitates SNP/indel calling and brief alignment viewing by eye.
+ * </p>
+ * <p>
+ *     Each line consists of chromosome, 1-based coordinate, reference base, the
+ *     number of reads covering the site, read bases and base qualities. At the
+ *     read base column, a dot stands for a match to the reference base on the
+ *     forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
+ *     on the forward strand and `acgtn' for a mismatch on the reverse strand.
+ *     A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
+ *     this reference position and the next reference position. The length of the
+ *     insertion is given by the integer in the pattern, followed by the inserted sequence.
+ * </p>
+ *
+ * <p>
+ *     See also: @see <a href="http://samtools.sourceforge.net/">SAMTools project</a><br>
+ *     See also: @see <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format</a>
+ * </p>
+ *
+ * <h2>File format example</h2>
+ * <pre>
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * </pre>
+ *
+ * @author Matt Hanna
+ * @since 2009
 */
 public class SAMPileupCodec implements FeatureCodec<SAMPileupFeature> {
    // the number of tokens we expect to parse from a pileup line
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java
@ -36,8 +36,21 @@ import org.broad.tribble.util.ParsingUtils;
 /**
 * Decodes a simple SAM text string.
 *
- * @author mhanna
- * @version 0.1
+ * <p>
+ * Reads in the SAM text version of a BAM file as a ROD.  For testing only
+ * </p>
+ *
+ * <p>
+ * See also: @see <a href="http://samtools.sourceforge.net">SAMTools</a> for format specification
+ * </p>
+ *
+ * <h2>File format example</h2>
+ * <pre>
+ *     SL-XBC:1:10:628:923#0	16	Escherichia_coli_K12	1	37	76M	=	1	0	AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA	B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB
+ * </pre>
+ *
+ * @author Matt Hanna
+ * @since 2009
 */
 public class SAMReadCodec implements FeatureCodec<SAMReadFeature> {
    /* SL-XBC:1:10:628:923#0	16	Escherichia_coli_K12	1	37	76M	=	1	0	AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA	B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java
@ -41,10 +41,11 @@ import java.io.IOException;

 /**
 * Codec for decoding the output format of the SnpEff variant effect predictor tool
- * (http://snpeff.sourceforge.net/).
 *
+ * <p>
 * This format has 23 tab-delimited fields:
 *
+ * <pre>
 * Chromosome
 * Position
 * Reference
@ -68,10 +69,16 @@ import java.io.IOException;
 * Codons Around
 * Amino Acids Around
 * Custom Interval ID
+ * </pre>
+ * Note that we treat all except the Chromosome, Position, and Effect fields as optional.
+ * </p>
 *
- * We treat all except the Chromosome, Position, and Effect fields as optional.
+ * <p>
+ * See also: @see <a href="http://snpeff.sourceforge.net/">SnpEff project page</a>
+ * </p>
 *
 * @author David Roazen
+ * @since 2011
 */
 public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec {

--- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java
@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
 import java.util.Arrays;

 /**
- * Created by IntelliJ IDEA.
- * User: chartl
- * Date: 3/28/11
- * Time: 2:47 PM
- * To change this template use File | Settings | File Templates.
- */
-/**
- * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop)
+ * The standard table codec that expects loci as contig start stop, not contig:start-stop
+ *
+ * <p>
+ * The standard table codec with a slightly different parsing convention
+ * (expects loci as contig start stop, not contig:start-stop)
+ * </p>
+ *
+ * <p>
+ * See also: TableCodec
+ * </p>
+ *
+ * @author Chris Hartl
+ * @since 2010
 */
 public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec {

--- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
@ -11,13 +11,40 @@ import java.util.ArrayList;
 import java.util.Arrays;

 /**
- * implementation of a simple table (tab or comma delimited format) input files
+ * Reads tab-delimited tabular text files
+ *
+ * <p>
+ *     <ul>
+ *     <li>Header: must begin with line HEADER or track (for IGV), followed by any number of column names,
+ *     separated by whitespace.</li>
+ *     <li>Comment lines starting with # are ignored</li>
+ *     <li>Each non-header and non-comment line is split into parts by whitespace,
+ *     and these parts are assigned as a map to their corresponding column name in the header.
+ *     Note that the first element (corresponding to the HEADER column) must be a valid genome loc
+ *     such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome.  TableCodec
+ *     requires that there be one value for each column in the header, and no more, on all lines.</li>
+ *     </ul>
+ * </p>
+ *
+ *
+ * <h2>File format example</h2>
+ * <pre>
+ *     HEADER a b c
+ *     1:1  1   2   3
+ *     1:2  4   5   6
+ *     1:3  7   8   9
+ * </pre>
+ *
+ * @author Mark DePristo
+ * @since 2009
 */
 public class TableCodec implements ReferenceDependentFeatureCodec {
-    protected String delimiterRegex = "\\s+";
-    protected String headerDelimiter = "HEADER";
-    protected String igvHeaderDelimiter = "track";
-    protected String commentDelimiter = "#";
+    final static protected String delimiterRegex = "\\s+";
+    final static protected String headerDelimiter = "HEADER";
+    final static protected String igvHeaderDelimiter = "track";
+    final static protected String commentDelimiter = "#";
+
    protected ArrayList<String> header = new ArrayList<String>();

    /**
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@ -14,10 +14,9 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;

-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
+import java.io.*;
 import java.util.*;
+import java.util.zip.GZIPInputStream;


 public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec {
@ -623,9 +622,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,

    public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
        try {
-            char[] buff = new char[MAGIC_HEADER_LINE.length()];
-            new FileReader(potentialInput).read(buff, 0, MAGIC_HEADER_LINE.length());
+            return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
+                    isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
+        } catch ( FileNotFoundException e ) {
+            return false;
+        } catch ( IOException e ) {
+            return false;
+        }
+    }
+
+    private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
+        try {
+            byte[] buff = new byte[MAGIC_HEADER_LINE.length()];
+            stream.read(buff, 0, MAGIC_HEADER_LINE.length());
            String firstLine = new String(buff);
+            stream.close();
            return firstLine.startsWith(MAGIC_HEADER_LINE);
        } catch ( IOException e ) {
            return false;
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java
@ -14,8 +14,20 @@ import java.util.*;


 /**
- * a feature codec for the VCF 3 specification.  Our aim is to read in the records and convert to VariantContext as
- * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+ * A feature codec for the VCF3 specification, to read older VCF files.  VCF3 has been
+ * deprecated in favor of VCF4 (see the VCF codec for the latest information)
+ *
+ * <p>
+ * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example)
+ * </p>
+ *
+ * <p>
+ * See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
+ * See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
+ * </p>
+ *
+ * @author Mark DePristo
+ * @since 2010
 */
 public class VCF3Codec extends AbstractVCFCodec {
    public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3";
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
@ -12,12 +12,46 @@ import java.io.FileReader;
 import java.io.IOException;
 import java.util.*;

-
 /**
- * a feature codec for the VCF 4 specification.  Our aim is to read in the records and convert to VariantContext as
- * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+ * A feature codec for the VCF 4 specification
+ *
+ * <p>
+ * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a
+ * header line, and then data lines each containing information about a position in the genome.
+ * </p>
+ * <p>One of the main uses of next-generation sequencing is to discover variation amongst large populations
+ * of related samples. Recently the format for storing next-generation read alignments has been
+ * standardised by the SAM/BAM file format specification. This has significantly improved the
+ * interoperability of next-generation tools for alignment, visualisation, and variant calling.
+ * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent
+ * types of sequence variation, including SNPs, indels and larger structural variants, together
+ * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for
+ * fast data retrieval of variants from a range of positions on the reference genome.
+ * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects
+ * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements
+ * various utilities for processing VCF files, including validation, merging and comparing,
+ * and also provides a general Perl and Python API.
+ * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.</p>
+ *
+ * <p>
+ * See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
+ * See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
+ * </p>
+ *
+ * <h2>File format example</h2>
+ * <pre>
+ *     ##fileformat=VCFv4.0
+ *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+ *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
+ *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
+ * </pre>
+ *
+ * @author Mark DePristo
+ * @since 2010
 */
 public class VCFCodec extends AbstractVCFCodec {
+    // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
+
    public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";

    /**