package org.broadinstitute.sting.gatk.refdata;

import edu.mit.broad.picard.util.SequenceUtil;

import java.util.*;

import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;

/**
 * One dbSNP record, populated from an already-split table line by {@link #parseLine(String[])}.
 *
 * Example format (field index 0 is the leading number, index 17 the trailing one):
 *   585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
 *   585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 */
public class rodDbSNP extends ReferenceOrderedDatum {
    /** Genome location of the SNP: reference chromosome or scaffold plus start and stop positions. */
    public GenomeLoc loc;

    /** Reference SNP identifier or Affy SNP name. */
    public String name;

    /** Which DNA strand contains the observed alleles ("+" means forward, see {@link #onFwdStrand()}). */
    public String strand;

    /** The reference base(s) according to NCBI, as given in the dbSNP file. */
    public String refBases;

    /** The sequences of the observed alleles from rs-fasta files, separated by '/'. */
    public String observed;

    /** Sample type from exemplar ss. */
    public String molType;

    /**
     * The class of variant (simple, insertion, deletion, range, etc.).
     * Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp',
     * 'insertion','deletion'.
     */
    public String varType;

    /**
     * The validation status of the SNP.
     * One of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap').
     */
    public String validationStatus;

    /** The average heterozygosity from all observations. */
    public double avHet;

    /** The standard error for the average heterozygosity. */
    public double avHetSE;

    /**
     * The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.).
     * set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
     * 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5')
     */
    public String func;

    /**
     * How the variant affects the reference sequence.
     * enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion')
     */
    public String locType;

    /** The quality of the alignment. */
    public int weight;

    // ----------------------------------------------------------------------
    //
    // Constructors
    //
    // ----------------------------------------------------------------------

    /** Creates an empty record; the fields are filled in by {@link #parseLine(String[])}. */
    public rodDbSNP() {}

    // ----------------------------------------------------------------------
    //
    // manipulating the SNP information
    //
    // ----------------------------------------------------------------------

    /** @return the genome location set by the last call to {@link #parseLine(String[])} */
    public GenomeLoc getLocation() {
        return loc;
    }

    /** @return true when the strand field is exactly "+" */
    public boolean onFwdStrand() {
        return strand.equals("+");
    }

    /**
     * Get the reference bases on the forward strand.
     *
     * @return {@code refBases} unchanged for forward-strand records, otherwise its reverse complement
     */
    public String getRefBasesFWD() {
        if (onFwdStrand()) {
            return refBases;
        }
        return SequenceUtil.reverseComplement(refBases);
    }

    /**
     * Get the observed alleles expressed on the forward strand.
     *
     * NOTE(review): for reverse-strand records the entire observed string, '/' separators
     * included, is handed to reverseComplement before splitting; presumably that also reverses
     * the order of the alleles -- confirm against SequenceUtil.
     *
     * @return a fixed-size list (backed by the split array) with one entry per '/'-separated allele
     */
    public List<String> getAllelesFWD() {
        final String fwdObserved =
                onFwdStrand() ? observed : SequenceUtil.reverseComplement(observed);
        return Arrays.asList(fwdObserved.split("/"));
    }

    /** @return the forward-strand alleles re-joined with '/' */
    public String getAllelesFWDString() {
        return Utils.join("/", getAllelesFWD());
    }

    // ----------------------------------------------------------------------
    //
    // What kind of variant are we?
    //
    // ----------------------------------------------------------------------

    /** @return true when the variant class contains "single" */
    public boolean isSNP() {
        return varType.contains("single");
    }

    /** @return true when the variant class contains "insertion" */
    public boolean isInsertion() {
        return varType.contains("insertion");
    }

    /** @return true when the variant class contains "deletion" */
    public boolean isDeletion() {
        return varType.contains("deletion");
    }

    /** @return true when the variant class contains "in-del" */
    public boolean isIndel() {
        return varType.contains("in-del");
    }

    /** @return true when the validation status contains "by-hapmap" */
    public boolean isHapmap() {
        return validationStatus.contains("by-hapmap");
    }

    /** @return true when the validation status contains "by-2hit-2allele" */
    public boolean is2Hit2Allele() {
        return validationStatus.contains("by-2hit-2allele");
    }

    // ----------------------------------------------------------------------
    //
    // formatting
    //
    // ----------------------------------------------------------------------

    /** @return all fields of the record as one tab-separated line, using the stored coordinates */
    @Override
    public String toString() {
        return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
                getContig(), getStart(), getStop(), name, strand, refBases, observed, molType,
                varType, validationStatus, avHet, avHetSE, func, locType, weight);
    }

    /** @return a short "name:observed:strand" description */
    public String toSimpleString() {
        return String.format("%s:%s:%s", name, observed, strand);
    }

    /**
     * @return "location:name:forward-strand alleles" followed by a ":SNP", ":Indel", ":Hapmap"
     *         and/or ":2Hit" tag for each classification that applies
     */
    public String toMediumString() {
        String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString());
        if (isSNP()) {
            s += ":SNP";
        }
        if (isIndel()) {
            s += ":Indel";
        }
        if (isHapmap()) {
            s += ":Hapmap";
        }
        if (is2Hit2Allele()) {
            s += ":2Hit";
        }
        return s;
    }

    /**
     * Re-emits the record in the 18-column input layout read by {@link #parseLine(String[])}.
     * Both coordinates are decremented to undo the +1 applied while parsing, the first column
     * is always written as 585, the column after the name as 0, and {@code refBases} fills
     * both reference-base columns.
     *
     * @return the tab-separated line
     */
    public String repl() {
        return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
                585, getContig(), getStart() - 1, getStop() - 1, name, strand, refBases, refBases,
                observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight);
    }

    /**
     * Populates this record from one split input line (see the class comment for the layout).
     *
     * @param parts the fields of the line; indices 1-4, 6, 7 and 9-17 are read
     * @throws RuntimeException (e.g. ArrayIndexOutOfBoundsException, NumberFormatException) when
     *         the line is too short or a numeric field is malformed; the offending line is
     *         printed to standard output before the exception is rethrown
     */
    public void parseLine(final String[] parts) {
        try {
            final String contig = parts[1];
            // The input coordinates are 0 based; shift both by one for the stored location.
            final long start = Long.parseLong(parts[2]) + 1;
            final long stop = Long.parseLong(parts[3]) + 1;
            loc = new GenomeLoc(contig, start, stop);

            name = parts[4];
            // parts[5] is the constant "0" column that repl() writes after the name.
            strand = parts[6];
            // Reference bases occupy columns 7 and 8 (repl() writes refBases into both);
            // column 7 is the first of the pair.
            refBases = parts[7];
            observed = parts[9];
            molType = parts[10];
            varType = parts[11];
            validationStatus = parts[12];
            avHet = Double.parseDouble(parts[13]);
            avHetSE = Double.parseDouble(parts[14]);
            func = parts[15];
            locType = parts[16];
            weight = Integer.parseInt(parts[17]);
        } catch (RuntimeException e) {
            System.out.printf("  Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
            throw e;
        }
    }
}