package org.broadinstitute.sting.gatk.refdata;
import net.sf.picard.util.SequenceUtil;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Reference-ordered datum holding a single record parsed from a dbSNP table dump.
 *
 * Example format:
 * 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
 * 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 */
public class rodDbSNP extends BasicReferenceOrderedDatum implements Variation, VariantBackedByGenotype, AllelicVariant {

    /** Genome location of the SNP (contig plus start and stop positions), built by {@link #parseLine}. */
    public GenomeLoc loc;

    /** Reference SNP identifier or Affy SNP name. */
    public String name;

    /** Which DNA strand contains the observed alleles ("+" is treated as forward, see {@link #onFwdStrand()}). */
    public String strand;

    /** The reference base(s) according to NCBI, as found in the dbSNP file. */
    public String refBases;

    /** The sequences of the observed alleles from rs-fasta files, separated by '/'. */
    public String observed;

    /** Sample type from exemplar ss. */
    public String molType;

    /**
     * The class of variant (simple, insertion, deletion, range, etc.).
     * Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion'.
     */
    public String varType;

    /**
     * The validation status of the SNP;
     * one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap').
     */
    public String validationStatus;

    /** The average heterozygosity from all observations. */
    public double avHet;

    /** The standard error for the average heterozygosity. */
    public double avHetSE;

    /**
     * The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.);
     * set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
     * 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5').
     */
    public String func;

    /**
     * How the variant affects the reference sequence;
     * enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion').
     */
    public String locType;

    /** The quality of the alignment. */
    public int weight;

    // ----------------------------------------------------------------------
    //
    // Constructors
    //
    // ----------------------------------------------------------------------

    /**
     * Creates an empty record; the fields are filled in later by {@link #parseLine}.
     *
     * @param name the name handed to the BasicReferenceOrderedDatum constructor
     */
    public rodDbSNP(final String name) {
        super(name);
    }

    // ----------------------------------------------------------------------
    //
    // manipulating the SNP information
    //
    // ----------------------------------------------------------------------

    /** @return the genome location of this record */
    public GenomeLoc getLocation() {
        return loc;
    }

    /**
     * get the reference base(s) at this position
     *
     * @return the reference base or bases, as a string
     */
    @Override
    public String getReference() {
        return getRefBasesFWD();
    }

    /**
     * get the -1 * (log 10 of the error value)
     *
     * @return the log based error estimate; fixed at 4 for every record
     */
    @Override
    public double getNegLog10PError() {
        return 4; // -log10(0.0001)
    }

    /** @return true if the strand field is "+" */
    public boolean onFwdStrand() {
        return strand.equals("+");
    }

    /**
     * Returns bases in the reference allele as a String. String can be empty (as in insertion into
     * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
     * (for longer indels).
     *
     * The value is picked from the forward-strand alleles rather than returned from refBases directly,
     * so that it is always the "other" allele relative to {@link #getAltBasesFWD()}: the first allele if it
     * equals refBases, the second allele otherwise.
     *
     * @return reference allele, forward strand
     */
    public String getRefBasesFWD() {
        // Split the observed string once instead of once per access.
        final List<String> alleles = getAllelesFWD();
        final String first = alleles.get(0);
        return first.equals(refBases) ? first : alleles.get(1);
    }

    /**
     * Returns reference (major) allele base for a SNP variant as a character.
     *
     * @return reference base on the forward strand
     * @throws IllegalStateException if this record is an indel
     */
    public char getRefSnpFWD() throws IllegalStateException {
        if (isIndel()) {
            throw new IllegalStateException("Variant is not a SNP");
        }
        // Same selection rule as getRefBasesFWD(), so ref and alt are always complementary choices.
        return getRefBasesFWD().charAt(0);
    }

    /**
     * Returns the observed alleles as a list, oriented to the forward strand. For records that are not
     * on the forward strand the whole observed string (separators included) is passed through
     * SequenceUtil.reverseComplement before being split on '/'.
     *
     * @return the observed alleles; a fixed-size list backed by the split array
     */
    public List<String> getAllelesFWD() {
        final String forwardObserved = onFwdStrand() ? observed : SequenceUtil.reverseComplement(observed);
        return Arrays.asList(forwardObserved.split("/"));
    }

    /** @return all forward-strand alleles concatenated without a separator */
    public String getAllelesFWDString() {
        return Utils.join("", getAllelesFWD());
    }

    /**
     * get the frequency of this variant
     *
     * @return always 0; this record type does not provide an allele frequency (placeholder value)
     */
    @Override
    public double getNonRefAlleleFrequency() {
        return 0;
    }

    /**
     * @return the VARIANT_TYPE of the current variant
     */
    @Override
    public VARIANT_TYPE getType() {
        // NOTE(review): always reports SNP, even when varType marks an insertion/deletion -- confirm intended.
        return VARIANT_TYPE.SNP;
    }

    // ----------------------------------------------------------------------
    //
    // What kind of variant are we?
    //
    // ----------------------------------------------------------------------

    /** @return true if varType contains "single" */
    public boolean isSNP() {
        return varType.contains("single");
    }

    /** @return true if varType contains "insertion" */
    public boolean isInsertion() {
        return varType.contains("insertion");
    }

    /** @return true if varType contains "deletion" */
    public boolean isDeletion() {
        return varType.contains("deletion");
    }

    /** @return true if this record is an insertion, a deletion, or has a varType containing "in-del" */
    public boolean isIndel() {
        return isInsertion() || isDeletion() || varType.contains("in-del");
    }

    /**
     * get the base representation of this Variant
     *
     * @return the forward-strand alleles concatenated into one string (see {@link #getAllelesFWDString()})
     */
    @Override
    public String getAlternateBase() {
        return getAllelesFWDString();
    }

    /**
     * gets the alternate bases. Use this method if the allele count is greater than 2
     *
     * @return a single-element list holding the value of {@link #getAlternateBase()}
     */
    @Override
    public List<String> getAlternateBases() {
        final List<String> bases = new ArrayList<String>();
        bases.add(getAlternateBase());
        return bases;
    }

    /**
     * gets the alternate base in the case of a SNP.
     *
     * @return a char, representing the alternate base
     * @throws IllegalStateException if this record is not a SNP
     */
    @Override
    public char getAlternativeBaseForSNP() {
        return getAltSnpFWD();
    }

    /**
     * gets the reference base in the case of a SNP.
     *
     * @return a char, representing the reference base on the forward strand
     * @throws IllegalStateException if this record is an indel
     */
    @Override
    public char getReferenceForSNP() {
        // Mirrors getAlternativeBaseForSNP(); previously an unimplemented stub that returned (char) 0.
        return getRefSnpFWD();
    }

    /** @return always false: snp locations are never "reference", there's always a variant */
    public boolean isReference() {
        return false;
    }

    /** @return true if validationStatus contains "by-hapmap" */
    public boolean isHapmap() {
        return validationStatus.contains("by-hapmap");
    }

    /** @return true if validationStatus contains "by-2hit-2allele" */
    public boolean is2Hit2Allele() {
        return validationStatus.contains("by-2hit-2allele");
    }

    // ----------------------------------------------------------------------
    //
    // formatting
    //
    // ----------------------------------------------------------------------

    /**
     * @return a tab-separated rendering of the record: contig, start, stop + 1, followed by the
     *         remaining fields in declaration order
     */
    @Override
    public String toString() {
        return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
                getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
                name, strand, refBases, observed, molType,
                varType, validationStatus, avHet, avHetSE, func, locType, weight);
    }

    /** @return "name:observed:strand" */
    public String toSimpleString() {
        return String.format("%s:%s:%s", name, observed, strand);
    }

    /**
     * @return "location:name:alleles" followed by a ":SNP", ":Indel", ":Hapmap" and/or ":2Hit" tag for
     *         each property that holds
     */
    public String toMediumString() {
        final StringBuilder text = new StringBuilder(
                String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString()));
        if (isSNP()) {
            text.append(":SNP");
        }
        if (isIndel()) {
            text.append(":Indel");
        }
        if (isHapmap()) {
            text.append(":Hapmap");
        }
        if (is2Hit2Allele()) {
            text.append(":2Hit");
        }
        return text.toString();
    }

    /**
     * Renders the record in the column layout read by {@link #parseLine}: a fixed leading 585, the contig,
     * start - 1 and stop, the name, a literal 0, the strand, refBases (written twice), and the remaining fields.
     *
     * @return the tab-separated line
     */
    public String repl() {
        return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
                585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
                name, strand, refBases, refBases, observed, molType,
                varType, validationStatus, avHet, avHetSE, func, locType, weight);
    }

    /**
     * Fills in this record from the columns of one dbSNP line.
     *
     * Columns read: 1 contig, 2 start, 3 stop, 4 name, 6 strand, 7 reference bases, 9 observed alleles,
     * 10 molType, 11 varType, 12 validationStatus, 13 avHet, 14 avHetSE, 15 func, 16 locType, 17 weight.
     *
     * @param header unused
     * @param parts  the columns of the line
     * @return true when the line was parsed
     * @throws RuntimeException if the line has too few columns (the ArrayIndexOutOfBoundsException is
     *                          attached as the cause) or a numeric column cannot be parsed
     */
    public boolean parseLine(final Object header, final String[] parts) {
        try {
            final String contig = parts[1];
            final long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
            final long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
            // Never let the stop fall before the start (start == stop in the file, e.g. insertions).
            loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));

            name = parts[4];
            strand = parts[6];
            // refBases is stored exactly as it appears in the file. This method used to contain
            // `if (strand == "-") refBases = BaseUtils.simpleReverseComplement(refBases);`, but == on
            // Strings compares references, so that branch did not run unless the caller happened to pass
            // the interned literal. The FWD accessors match refBases directly against the forward-strand
            // alleles, so the unmodified value is what they have been working with.
            // NOTE(review): presumably the file's reference column is already forward-strand -- confirm
            // against the dbSNP table schema before reintroducing any strand flip here.
            refBases = parts[7];
            observed = parts[9];
            molType = parts[10];
            varType = parts[11];
            validationStatus = parts[12];
            avHet = Double.parseDouble(parts[13]);
            avHetSE = Double.parseDouble(parts[14]);
            func = parts[15];
            locType = parts[16];
            weight = Integer.parseInt(parts[17]);
            return true;
        } catch (MalformedGenomeLocException ex) {
            // Just rethrow malformed genome locs; the ROD system itself will deal with these.
            throw ex;
        } catch (ArrayIndexOutOfBoundsException ex) {
            // Too few columns on the line: report it, keeping the original exception as the cause.
            throw new RuntimeException("Badly formed dbSNP line: " + ex, ex);
        } catch (RuntimeException e) {
            // Anything else (e.g. an unparsable number): show the offending line, then propagate.
            System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts));
            throw e;
        }
    }

    /**
     * Returns the allele that {@link #getRefBasesFWD()} did not pick: the second forward-strand allele
     * if the first equals refBases, the first allele otherwise.
     *
     * @return alternate allele, forward strand
     */
    public String getAltBasesFWD() {
        final List<String> alleles = getAllelesFWD();
        return alleles.get(0).equals(refBases) ? alleles.get(1) : alleles.get(0);
    }

    /**
     * @return the first base of the alternate allele on the forward strand
     * @throws IllegalStateException if this record is not a SNP
     */
    public char getAltSnpFWD() throws IllegalStateException {
        if (!isSNP()) {
            throw new IllegalStateException("I'm not a SNP");
        }
        return getAltBasesFWD().charAt(0);
    }

    /** @return always Double.MAX_VALUE (placeholder; no per-record confidence is stored) */
    public double getConsensusConfidence() {
        return Double.MAX_VALUE;
    }

    /** @return a single-element list holding the concatenated forward-strand alleles */
    public List<String> getGenotype() throws IllegalStateException {
        return Arrays.asList(getAllelesFWDString());
    }

    /** @return always -1 (placeholder) */
    public double getMAF() {
        // Fixme: update to actually get MAF
        return -1;
    }

    /** @return the average heterozygosity (avHet) */
    public double getHeterozygosity() {
        return avHet;
    }

    /** @return always 0 (placeholder) */
    public int getPloidy() throws IllegalStateException {
        return 0;
    }

    /** @return always Double.MAX_VALUE (placeholder) */
    public double getVariationConfidence() {
        return Double.MAX_VALUE;
    }

    /** @return always false */
    public boolean isGenotype() {
        return false;
    }

    /** @return true if the observed string contains at most one '/' separator */
    public boolean isBiallelic() {
        return observed.indexOf('/') == observed.lastIndexOf('/');
    }

    /** @return the number of positions spanned by the location, stop - start + 1 */
    public int length() {
        return (int) (loc.getStop() - loc.getStart() + 1);
    }

    /**
     * get the genotype
     *
     * @return a BasicGenotype built from the location, the forward alternate bases, the forward
     *         reference base and the consensus confidence
     */
    @Override
    public org.broadinstitute.sting.utils.genotype.Genotype getCalledGenotype() {
        return new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence());
    }

    /**
     * get the genotypes
     *
     * @return a single-element list holding the called genotype
     */
    @Override
    public List<org.broadinstitute.sting.utils.genotype.Genotype> getGenotypes() {
        final List<org.broadinstitute.sting.utils.genotype.Genotype> genotypes =
                new ArrayList<org.broadinstitute.sting.utils.genotype.Genotype>();
        genotypes.add(getCalledGenotype());
        return genotypes;
    }

    /**
     * do we have the specified genotype? not all backedByGenotypes
     * have all the genotype data.
     *
     * @param x the genotype
     *
     * @return true if the genotype's string form equals the forward alternate bases, false otherwise
     */
    @Override
    public boolean hasGenotype(DiploidGenotype x) {
        return x.toString().equals(this.getAltBasesFWD());
    }
}