diff --git a/build.xml b/build.xml
index 85955d774..d8c38738a 100644
--- a/build.xml
+++ b/build.xml
@@ -49,7 +49,7 @@
-
+ * Codec Description + *
+ * + *
+ * See also: @see VCF specification
+ *
+ * line 1 + * line 2 + * line 3 + *+ * + * @author Mark DePristo + * @since 2010 + */ public class BeagleCodec implements ReferenceDependentFeatureCodec
+ * The format includes eleven standard fields, plus genotypes for each of the samples included + * in the file: + * + *
+ * Col1: refSNP rs# identifier at the time of release (NB might merge with another rs# in the future) + * Col2: SNP alleles according to dbSNP + * Col3: chromosome that SNP maps to + * Col4: chromosome position of SNP, in basepairs on reference sequence + * Col5: strand of reference sequence that SNP maps to + * Col6: version of reference sequence assembly + * Col7: HapMap genotype center that produced the genotypes + * Col8: LSID for HapMap protocol used for genotyping + * Col9: LSID for HapMap assay used for genotyping + * Col10: LSID for panel of individuals genotyped + * Col11: QC-code, currently 'QC+' for all entries (for future use) + * Col12 and on: observed genotypes of samples, one per column, sample identifiers in column headers (Coriell catalog numbers, example: NA10847). Duplicate samples have .dup suffix. + *+ * + * + *
+ * See also: @see HapMap genotypes download + *
+ * + *+ * rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA19625 NA19700 NA19701 NA19702 NA19703 NA19704 NA19705 NA19708 NA19712 NA19711 NA19818 NA19819 NA19828 NA19835 NA19834 NA19836 NA19902 NA19901 NA19900 NA19904 NA19919 NA19908 NA19909 NA19914 NA19915 NA19916 NA19917 NA19918 NA19921 NA20129 NA19713 NA19982 NA19983 NA19714 NA19985 NA20128 NA20126 NA20127 NA20277 NA20276 NA20279 NA20282 NA20281 NA20284 NA20287 NA20288 NA20290 NA20289 NA20291 NA20292 NA20295 NA20294 NA20297 NA20300 NA20301 NA20302 NA20317 NA20319 NA20322 NA20333 NA20332 NA20335 NA20334 NA20337 NA20336 NA20340 NA20341 NA20343 NA20342 NA20344 NA20345 NA20346 NA20347 NA20348 NA20349 NA20350 NA20357 NA20356 NA20358 NA20359 NA20360 NA20363 NA20364 + * rs9629043 C/T chr1 554636 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8575115:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ CC CC CC CC CC CC CC CC CC CC CC CC NN CC CC CC CT CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC + * rs28446478 G/T chr1 576058 + ncbi_b36 sanger urn:LSID:illumina.hapmap.org:Protocol:Human_1M_BeadChip:3 urn:LSID:sanger.hapmap.org:Assay:H1Mrs28446478:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GT TT GT TT TT TT TT GT GT TT TT TT TT GT GT GT GT TT GT TT GT GT TT GT GT TT TT TT GT GT TT TT TT GT TT GT TT GT GT GT GT GT TT GT TT TT GT GT TT TT TT TT TT TT GT GT GT GT TT TT TT TT GT TT GT TT TT GT TT TT TT GT TT TT TT GT GT TT GT TT GT TT TT + * rs12565286 C/G chr1 711153 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8709646:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG 
GG GG GG GG CG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG GG CG NN GG GG GG GG GG GG NN GG NN NN + *+ * + * @author Mark DePristo + * @since 2010 */ public class RawHapMapCodec implements FeatureCodec { // the minimum number of features in the HapMap file line diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index 391715c63..d94d9ff84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -12,7 +12,23 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.ArrayList; /** - * the ref seq codec + * TODO FOR CHRIS HARTL + * + *
+ * Codec Description + *
+ * + *+ * See also: link to file specification + *
+ * + *+ * A BAM file containing exactly one sample. + *
+ * + * @author Mark DePristo + * @since 2010 */ public class RefSeqCodec implements ReferenceDependentFeatureCodec+ * Pileup format was first used by Tony Cox and Zemin Ning at the Sanger Institute. + * It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eye. + *
+ *+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *
+ * + *
+ *
See also: @see SAMTools project
+ *
See also: @see Pileup format
+ *
+ * seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<& + * seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+ + * seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6 + * seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<< + * seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6< + * seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&< + * seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<< + * seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<< + *+ * + * @author Matt Hanna + * @since 2009 */ public class SAMPileupCodec implements FeatureCodec
+ * Reads in the SAM text version of a BAM file as a ROD. For testing only + *
+ * + *+ * See also: @see SAMTools for format specification + *
+ * + *+ * SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB + *+ * + * @author Matt Hanna + * @since 2009 */ public class SAMReadCodec implements FeatureCodec
* This format has 23 tab-delimited fields: * + *
* Chromosome * Position * Reference @@ -68,10 +69,16 @@ import java.io.IOException; * Codons Around * Amino Acids Around * Custom Interval ID + *+ * Note that we treat all except the Chromosome, Position, and Effect fields as optional. + * * - * We treat all except the Chromosome, Position, and Effect fields as optional. + *
+ * See also: @see SNPEff project page + *
* * @author David Roazen + * @since 2011 */ public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java index 6fe1907e3..fdcc8ed10 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java @@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import java.util.Arrays; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 3/28/11 - * Time: 2:47 PM - * To change this template use File | Settings | File Templates. - */ -/** - * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop) + * The standard table codec that expects loci as contig start stop, not contig:start-stop + * + *+ * The standard table codec with a slightly different parsing convention + * (expects loci as contig start stop, not contig:start-stop) + *
+ * + *+ * See also: TableCodec + *
+ * + * @author Chris Hartl + * @since 2010 */ public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index 2ce7c679e..1919ccbf0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -11,13 +11,40 @@ import java.util.ArrayList; import java.util.Arrays; /** - * implementation of a simple table (tab or comma delimited format) input files + * Reads tab-delimited tabular text files + * + *+ *
+ * HEADER a b c + * 1:1 1 2 3 + * 1:2 4 5 6 + * 1:3 7 8 9 + *+ * + * @author Mark DePristo + * @since 2009 */ public class TableCodec implements ReferenceDependentFeatureCodec { - protected String delimiterRegex = "\\s+"; - protected String headerDelimiter = "HEADER"; - protected String igvHeaderDelimiter = "track"; - protected String commentDelimiter = "#"; + final static protected String delimiterRegex = "\\s+"; + final static protected String headerDelimiter = "HEADER"; + final static protected String igvHeaderDelimiter = "track"; + final static protected String commentDelimiter = "#"; + protected ArrayList
+ * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) + *
+ * + *
+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication
+ *
+ * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a + * header line, and then data lines each containing information about a position in the genome. + *
+ *One of the main uses of next-generation sequencing is to discover variation amongst large populations + * of related samples. Recently the format for storing next-generation read alignments has been + * standardised by the SAM/BAM file format specification. This has significantly improved the + * interoperability of next-generation tools for alignment, visualisation, and variant calling. + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent + * types of sequence variation, including SNPs, indels and larger structural variants, together + * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for + * fast data retrieval of variants from a range of positions on the reference genome. + * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects + * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements + * various utilities for processing VCF files, including validation, merging and comparing, + * and also provides a general Perl and Python API. + * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.
+ * + *
+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication
+ *
+ * ##fileformat=VCFv4.0 + * #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 + * chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99 + * chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99 + *+ * + * @author Mark DePristo + * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /**