diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index a878bd556..f3f2fe664 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -76,6 +76,7 @@ public class ReferenceOrderedData implements addModule("Variants", RodGeliText.class); addModule("GLF", RodGLF.class); addModule("VCF", RodVCF.class); + addModule("PicardDbSNP", rodPicardDbSNP.class); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodPicardDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodPicardDbSNP.java new file mode 100644 index 000000000..33b7e4d8b --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodPicardDbSNP.java @@ -0,0 +1,282 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.refdata; + +import edu.mit.broad.picard.variation.KnownVariant; +import edu.mit.broad.picard.variation.VariantType; +import edu.mit.broad.picard.variation.DbSnpFileReader; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Collections; +import java.util.Arrays; + +/** + * @author alecw@broadinstitute.org + */ +public class rodPicardDbSNP implements VariationRod { + + private final KnownVariant knownVariant; + private final MyGenomeLoc loc; + private final List alleles; + + public rodPicardDbSNP(final KnownVariant knownVariant) { + this.knownVariant = knownVariant; + alleles = Collections.unmodifiableList(Arrays.asList(knownVariant.getObservedAlleles())); + loc = new MyGenomeLoc(knownVariant); + } + + /** + * are we bi-allelic? + */ + public boolean isBiallelic() { + return alleles.size() == 2; + } + + /** + * get the frequency of this variant, if we're a variant. If we're reference this method + * should return 0. If we can't provide an alternate allele frequency, this should also + * return 0. + *

+ * WARNING: This method is only valid for biAllelic data, the contract is to check isBiallelic() + * before calling this method + * + * @return double the minor allele frequency + */ + public double getNonRefAlleleFrequency() { + return knownVariant.getMinorAlleleFrequency(); + } + + /** + * A convenience method, for switching over the variation type + * + * @return the VARIANT_TYPE of the current variant + */ + public VARIANT_TYPE getType() { + switch (knownVariant.getType()) { + case SNP: + return VARIANT_TYPE.SNP; + case insertion: + case deletion: + return VARIANT_TYPE.INDEL; + } + return null; + } + + /** + * are we a SNP? If not we're a Indel/deletion or the reference. This method must be call before you use + * the convenience methods getAlternativeBaseForSNP or getReferenceForSNP, to ensure that you're working with a SNP + * + * @return true if we're a SNP + */ + public boolean isSNP() { + return knownVariant.getType() == VariantType.SNP; + } + + /** + * are we an insertion? + * + * @return true if we are, false otherwise + */ + public boolean isInsertion() { + return knownVariant.getType() == VariantType.insertion; + } + + /** + * are we an deletion? + * + * @return true if we are, false otherwise + */ + public boolean isDeletion() { + return knownVariant.getType() == VariantType.deletion; + } + + /** + * are we a variant that represents the reference allele? + * + * @return false if we're a variant(indel, delete, SNP, etc), true if we're hom ref + */ + public boolean isReference() { + return false; // snp locations are never "reference", there's always a variant + } + + /** + * are we an insertion or a deletion? yes, then return true. No? false. + * + * @return true if we're an insertion or deletion + */ + public boolean isIndel() { + return getType() == VARIANT_TYPE.INDEL; + } + + public String getName() { + return "PicarddbSNP"; + } + + public boolean parseLine(final Object header, final String[] parts) throws IOException { + throw new UnsupportedOperationException("Not a text format so this should never be called."); + } + + public String toSimpleString() { + return getName() + ":" + knownVariant.getObservedAllelesString(); + } + + public String repl() { + throw new UnsupportedOperationException("Not a text format so this should never be called."); + } + + /** + * Used by the ROD system to determine how to split input lines + * + * @return Regex string delimiter separating fields + */ + public String delimiterRegex() { + throw new UnsupportedOperationException("Not a text format so this should never be called."); + } + + /** + * get the location of this Variant + * + * @return a GenomeLoc + */ + public GenomeLoc getLocation() { + return loc; + } + + public int compareTo(final ReferenceOrderedDatum that) { + return getLocation().compareTo(that.getLocation()); + } + + /** + * Backdoor hook to read header, meta-data, etc. associated with the file. Will be + * called by the ROD system before streaming starts + * + * @param source source data file on disk from which this rod stream will be pulled + * @return a header object that will be passed to parseLine command + */ + public Object initialize(final File source) throws FileNotFoundException { + throw new UnsupportedOperationException("Not a text format so this should never be called."); + } + + /** + * get the reference base(s) for this Variant + * + * @return the reference base or bases, as a string + */ + public String getReference() { + // Not known without looking in reference fasta. + return null; + } + + /** + * get the -1 * (log 10 of the error value) + * + * @return the postive number space log based error estimate + */ + public double getNegLog10PError() { + // Copied from rodDbSNP + return 4; // -log10(0.0001) + } + + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + public List getAlternateAlleleList() { + throw new UnsupportedOperationException("Don't know which is the reference allele"); + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. If the reference base is not an allele in this varation + * it will not be in the list (i.e. there is no guarantee that the reference base is in the list). + * + * @return an alternate allele list + */ + public List getAlleleList() { + return alleles; + } + + /** + * gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP + * of + * + * @return a char, representing the alternate base + */ + public char getAlternativeBaseForSNP() { + throw new UnsupportedOperationException("Don't know which is the reference allele"); + } + + /** + * gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP + * + * @return a char, representing the alternate base + */ + public char getReferenceForSNP() { + throw new UnsupportedOperationException("Don't know which is the reference allele"); + } + + public static Iterator createIterator(final String trackName, final File f) throws IOException { + return new rodPicardDbSNPIterator(f); + } + + private static class MyGenomeLoc extends GenomeLoc { + private MyGenomeLoc(final KnownVariant knownVariant) { + // GenomeLoc is one-based half-open interval. + super(knownVariant.getReferenceSequence(), knownVariant.getSequenceIndex(), knownVariant.getStartPos(), + knownVariant.getEndPos() + 1); + } + } + + private static class rodPicardDbSNPIterator implements Iterator { + private final DbSnpFileReader reader; + + private rodPicardDbSNPIterator(final File f) { + reader = new DbSnpFileReader(f); + } + + public boolean hasNext() { + return reader.hasNext(); + } + + public rodPicardDbSNP next() { + return new rodPicardDbSNP(reader.next()); + } + + public void remove() { + reader.remove(); + } + + } + +}