diff --git a/build.xml b/build.xml index 85955d774..d8c38738a 100644 --- a/build.xml +++ b/build.xml @@ -49,7 +49,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java index e328c9286..413848543 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java @@ -40,6 +40,29 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Pattern; +/** + * TODO GUILLERMO DEL ANGEL + * + *

+ * Codec Description + *

+ * + *

+ * See also: @see VCF specification
+ *

+ + *

+ * + *

File format example

+ *
+ *     line 1
+ *     line 2
+ *     line 3
+ * 
+ * + * @author Mark DePristo + * @since 2010 + */ public class BeagleCodec implements ReferenceDependentFeatureCodec { private String[] header; public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 535f607a1..a80e05d59 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -33,12 +33,43 @@ import java.io.IOException; import java.util.Arrays; /** - * a codec for the file types produced by the HapMap consortium, available on their website: - * http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/ + * A codec for the file types produced by the HapMap consortium * - * The format includes eleven standard fields, plus genotypes for each of the samples included - * in the file - * + *

+ * The format includes eleven standard fields, plus genotypes for each of the samples included + * in the file: + * + *

+ *     Col1: refSNP rs# identifier at the time of release (NB might merge with another rs# in the future)
+ *     Col2: SNP alleles according to dbSNP
+ *     Col3: chromosome that SNP maps to
+ *     Col4: chromosome position of SNP, in basepairs on reference sequence
+ *     Col5: strand of reference sequence that SNP maps to
+ *     Col6: version of reference sequence assembly
+ *     Col7: HapMap genotype center that produced the genotypes
+ *     Col8: LSID for HapMap protocol used for genotyping
+ *     Col9: LSID for HapMap assay used for genotyping
+ *     Col10: LSID for panel of individuals genotyped
+ *     Col11: QC-code, currently 'QC+' for all entries (for future use)
+ *     Col12 and on: observed genotypes of samples, one per column, sample identifiers in column headers (Coriell catalog numbers, example: NA10847). Duplicate samples have .dup suffix.
+ * 
+ *

+ * + *

+ * See also: @See HapMap genotypes download + *

+ * + *

File format example

+ * From genotypes_chr1_ASW_r27_nr.b36_fwd.txt.gz: + *
+ *     rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA19625 NA19700 NA19701 NA19702 NA19703 NA19704 NA19705 NA19708 NA19712 NA19711 NA19818 NA19819 NA19828 NA19835 NA19834 NA19836 NA19902 NA19901 NA19900 NA19904 NA19919 NA19908 NA19909 NA19914 NA19915 NA19916 NA19917 NA19918 NA19921 NA20129 NA19713 NA19982 NA19983 NA19714 NA19985 NA20128 NA20126 NA20127 NA20277 NA20276 NA20279 NA20282 NA20281 NA20284 NA20287 NA20288 NA20290 NA20289 NA20291 NA20292 NA20295 NA20294 NA20297 NA20300 NA20301 NA20302 NA20317 NA20319 NA20322 NA20333 NA20332 NA20335 NA20334 NA20337 NA20336 NA20340 NA20341 NA20343 NA20342 NA20344 NA20345 NA20346 NA20347 NA20348 NA20349 NA20350 NA20357 NA20356 NA20358 NA20359 NA20360 NA20363 NA20364
+ *     rs9629043 C/T chr1 554636 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8575115:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ CC CC CC CC CC CC CC CC CC CC CC CC NN CC CC CC CT CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC
+ *     rs28446478 G/T chr1 576058 + ncbi_b36 sanger urn:LSID:illumina.hapmap.org:Protocol:Human_1M_BeadChip:3 urn:LSID:sanger.hapmap.org:Assay:H1Mrs28446478:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GT TT GT TT TT TT TT GT GT TT TT TT TT GT GT GT GT TT GT TT GT GT TT GT GT TT TT TT GT GT TT TT TT GT TT GT TT GT GT GT GT GT TT GT TT TT GT GT TT TT TT TT TT TT GT GT GT GT TT TT TT TT GT TT GT TT TT GT TT TT TT GT TT TT TT GT GT TT GT TT GT TT TT
+ *     rs12565286 C/G chr1 711153 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8709646:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG GG CG NN GG GG GG GG GG GG NN GG NN NN
+ * 
+ * + * @author Mark DePristo + * @since 2010 */ public class RawHapMapCodec implements FeatureCodec { // the minimum number of features in the HapMap file line diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index 391715c63..d94d9ff84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -12,7 +12,23 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.ArrayList; /** - * the ref seq codec + * TODO FOR CHRIS HARTL + * + *

+ * Codec Description + *

+ * + *

+ * See also: link to file specification + *

+ * + *

File format example

+ *

+ * A BAM file containing exactly one sample. + *

+ * + * @author Mark DePristo + * @since 2010 */ public class RefSeqCodec implements ReferenceDependentFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index f4048d37d..f4633b2ce 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -38,10 +38,43 @@ import java.util.regex.Pattern; import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; /** - * A Tribble encoder / decoder for SAM pileup data. + * Decoder for SAM pileup data. For GATK validation purposes only * - * @author mhanna - * @version 0.1 + *

+ * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute. + * It desribes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eyes. + *

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ * + *

+ *
See also: @see SAMTools project
+ *
See also: @see Pileup format
+ *

+ * + *

File format example

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ * + * @author Matt Hanna + * @since 2009 */ public class SAMPileupCodec implements FeatureCodec { // the number of tokens we expect to parse from a pileup line diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java index f6861e585..d4bdb5aa9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java @@ -36,8 +36,21 @@ import org.broad.tribble.util.ParsingUtils; /** * Decodes a simple SAM text string. * - * @author mhanna - * @version 0.1 + *

+ * Reads in the SAM text version of a BAM file as a ROD. For testing only + *

+ * + *

+ * See also: @see SAMTools for format specification + *

+ * + *

File format example

+ *
+ *     SL-XBC:1:10:628:923#0	16	Escherichia_coli_K12	1	37	76M	=	1	0	AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA	B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB
+ * 
+ * + * @author Matt Hanna + * @since 2009 */ public class SAMReadCodec implements FeatureCodec { /* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */ diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java index b5efb49a7..7f3d9e17d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java @@ -41,10 +41,11 @@ import java.io.IOException; /** * Codec for decoding the output format of the SnpEff variant effect predictor tool - * (http://snpeff.sourceforge.net/). * + *

* This format has 23 tab-delimited fields: * + *

  * Chromosome
  * Position
  * Reference
@@ -68,10 +69,16 @@ import java.io.IOException;
  * Codons Around
  * Amino Acids Around
  * Custom Interval ID
+ * 
+ * Note that we treat all except the Chromosome, Position, and Effect fields as optional. + *

* - * We treat all except the Chromosome, Position, and Effect fields as optional. + *

+ * See also: @see SNPEff project page + *

* * @author David Roazen + * @since 2011 */ public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java index 6fe1907e3..fdcc8ed10 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java @@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import java.util.Arrays; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 3/28/11 - * Time: 2:47 PM - * To change this template use File | Settings | File Templates. - */ -/** - * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop) + * The standard table codec that expects loci as contig start stop, not contig:start-stop + * + *

+ * The standard table codec with a slightly different parsing convention + * (expects loci as contig start stop, not contig:start-stop) + *

+ * + *

+ * See also: TableCodec + *

+ * + * @author Chris Hartl + * @since 2010 */ public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index 2ce7c679e..1919ccbf0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -11,13 +11,40 @@ import java.util.ArrayList; import java.util.Arrays; /** - * implementation of a simple table (tab or comma delimited format) input files + * Reads tab deliminated tabular text files + * + *

+ *

    + *
  • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, + * separated by whitespace.
  • + *
  • Comment lines starting with # are ignored
  • + *
  • Each non-header and non-comment line is split into parts by whitespace, + * and these parts are assigned as a map to their corresponding column name in the header. + * Note that the first element (corresponding to the HEADER column) must be a valid genome loc + * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec + * requires that there be one value for each column in the header, and no more, on all lines.
  • + *
+ *

+ * + *

+ * + *

File format example

+ *
+ *     HEADER a b c
+ *     1:1  1   2   3
+ *     1:2  4   5   6
+ *     1:3  7   8   9
+ * 
+ * + * @author Mark DePristo + * @since 2009 */ public class TableCodec implements ReferenceDependentFeatureCodec { - protected String delimiterRegex = "\\s+"; - protected String headerDelimiter = "HEADER"; - protected String igvHeaderDelimiter = "track"; - protected String commentDelimiter = "#"; + final static protected String delimiterRegex = "\\s+"; + final static protected String headerDelimiter = "HEADER"; + final static protected String igvHeaderDelimiter = "track"; + final static protected String commentDelimiter = "#"; + protected ArrayList header = new ArrayList(); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 19f58ddaa..46242c302 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -14,10 +14,9 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; import java.util.*; +import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec { @@ -623,9 +622,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { try { - char[] buff = new char[MAGIC_HEADER_LINE.length()]; - new FileReader(potentialInput).read(buff, 0, MAGIC_HEADER_LINE.length()); + return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || + isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); + } catch ( FileNotFoundException e ) { + return false; + } catch ( IOException e ) { + return false; + } + } + + private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { + try { + byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; + stream.read(buff, 0, MAGIC_HEADER_LINE.length()); String firstLine = new String(buff); + stream.close(); return firstLine.startsWith(MAGIC_HEADER_LINE); } catch ( IOException e ) { return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index ea16595bb..e5b1a2de5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -14,8 +14,20 @@ import java.util.*; /** - * a feature codec for the VCF 3 specification. Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been + * depreciated in favor of VCF4 (See VCF codec for the latest information) + * + *

+ * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) + *

+ * + *

+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication + *

+ * + * @author Mark DePristo + * @since 2010 */ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 55a0eb3f9..fa030ef5f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -12,12 +12,46 @@ import java.io.FileReader; import java.io.IOException; import java.util.*; - /** - * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF 4 specification + * + *

+ * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a + * header line, and then data lines each containing information about a position in the genome. + *

+ *

One of the main uses of next-generation sequencing is to discover variation amongst large populations + * of related samples. Recently the format for storing next-generation read alignments has been + * standardised by the SAM/BAM file format specification. This has significantly improved the + * interoperability of next-generation tools for alignment, visualisation, and variant calling. + * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * types of sequence variation, including SNPs, indels and larger structural variants, together + * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for + * fast data retrieval of variants from a range of positions on the reference genome. + * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects + * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements + * various utilities for processing VCF files, including validation, merging and comparing, + * and also provides a general Perl and Python API. + * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

+ * + *

+ * See also: @see VCF specification
+ * See also: @see VCF spec. publication + *

+ * + *

File format example

+ *
+ *     ##fileformat=VCFv4.0
+ *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+ *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
+ *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
+ * 
+ * + * @author Mark DePristo + * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /**