2009-03-16 06:37:20 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import edu.mit.broad.picard.util.SequenceUtil;
|
|
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
import org.broadinstitute.sting.utils.Utils;
|
2009-04-03 04:48:59 +08:00
|
|
|
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Example format:
|
|
|
|
|
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
|
|
|
|
|
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
|
|
|
|
|
*
|
|
|
|
|
* User: mdepristo
|
|
|
|
|
* Date: Feb 27, 2009
|
|
|
|
|
* Time: 10:47:14 AM
|
|
|
|
|
* To change this template use File | Settings | File Templates.
|
|
|
|
|
*/
|
2009-04-03 04:48:59 +08:00
|
|
|
public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant {
|
2009-03-16 06:37:20 +08:00
|
|
|
public GenomeLoc loc; // genome location of SNP
|
|
|
|
|
// Reference sequence chromosome or scaffold
|
|
|
|
|
// Start and stop positions in chrom
|
|
|
|
|
|
|
|
|
|
public String name; // Reference SNP identifier or Affy SNP name
|
|
|
|
|
public String strand; // Which DNA strand contains the observed alleles
|
|
|
|
|
|
|
|
|
|
public String refBases; // the reference base according to NCBI, in the dbSNP file
|
|
|
|
|
public String observed; // The sequences of the observed alleles from rs-fasta files
|
|
|
|
|
|
|
|
|
|
public String molType; // Sample type from exemplar ss
|
|
|
|
|
public String varType; // The class of variant (simple, insertion, deletion, range, etc.)
|
|
|
|
|
// Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion'
|
|
|
|
|
public String validationStatus; // The validation status of the SNP
|
|
|
|
|
// one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap')
|
|
|
|
|
|
|
|
|
|
public double avHet; // The average heterozygosity from all observations
|
|
|
|
|
public double avHetSE; // The Standard Error for the average heterozygosity
|
|
|
|
|
|
|
|
|
|
public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.)
|
|
|
|
|
// set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
|
|
|
|
|
// 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5')
|
|
|
|
|
public String locType; // How the variant affects the reference sequence
|
|
|
|
|
// enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion')
|
|
|
|
|
|
|
|
|
|
public int weight; // The quality of the alignment
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Constructors
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public rodDbSNP() {}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// manipulating the SNP information
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public GenomeLoc getLocation() { return loc; }
|
|
|
|
|
|
|
|
|
|
public boolean onFwdStrand() {
|
|
|
|
|
return strand.equals("+");
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-03 04:48:59 +08:00
|
|
|
/** Returns bases in the reference allele as a String. String can be empty (as in insertion into
|
|
|
|
|
* the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
|
|
|
|
|
* (for longer indels).
|
|
|
|
|
*
|
|
|
|
|
* @return reference allele, forward strand
|
|
|
|
|
*/
|
|
|
|
|
public String getRefBasesFWD() {
|
2009-03-16 06:37:20 +08:00
|
|
|
if ( onFwdStrand() )
|
|
|
|
|
return refBases;
|
|
|
|
|
else
|
|
|
|
|
return SequenceUtil.reverseComplement(refBases);
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-03 04:48:59 +08:00
|
|
|
/**
|
|
|
|
|
* Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException
|
|
|
|
|
* if variant is not a SNP.
|
|
|
|
|
*
|
|
|
|
|
* @return reference base on the forward strand
|
|
|
|
|
*/
|
|
|
|
|
public char getRefSnpFWD() throws IllegalStateException {
|
|
|
|
|
if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP");
|
|
|
|
|
if ( onFwdStrand() ) return refBases.charAt(0);
|
|
|
|
|
else return SequenceUtil.reverseComplement(refBases).charAt(0);
|
|
|
|
|
}
|
|
|
|
|
|
2009-03-16 06:37:20 +08:00
|
|
|
public List<String> getAllelesFWD() {
|
|
|
|
|
List<String> alleles = null;
|
|
|
|
|
if ( onFwdStrand() )
|
|
|
|
|
alleles = Arrays.asList(observed.split("/"));
|
|
|
|
|
else
|
|
|
|
|
alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/"));
|
|
|
|
|
|
|
|
|
|
//System.out.printf("getAlleles %s on %s %b => %s %n", observed, strand, onFwdStrand(), Utils.join("/", alleles));
|
|
|
|
|
return alleles;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getAllelesFWDString() {
|
|
|
|
|
return Utils.join("/", getAllelesFWD());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// What kind of variant are we?
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public boolean isSNP() { return varType.contains("single"); }
|
|
|
|
|
public boolean isInsertion() { return varType.contains("insertion"); }
|
|
|
|
|
public boolean isDeletion() { return varType.contains("deletion"); }
|
2009-04-03 04:48:59 +08:00
|
|
|
public boolean isIndel() { return isInsertion() || isDeletion() || varType.contains("in-del"); }
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
public boolean isHapmap() { return validationStatus.contains("by-hapmap"); }
|
|
|
|
|
public boolean is2Hit2Allele() { return validationStatus.contains("by-2hit-2allele"); }
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// formatting
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public String toString() {
|
|
|
|
|
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
|
|
|
|
getContig(), getStart(), getStop(), name, strand, refBases, observed, molType,
|
|
|
|
|
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String toSimpleString() {
|
|
|
|
|
return String.format("%s:%s:%s", name, observed, strand);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String toMediumString() {
|
|
|
|
|
String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString());
|
|
|
|
|
if ( isSNP() ) s += ":SNP";
|
|
|
|
|
if ( isIndel() ) s += ":Indel";
|
|
|
|
|
if ( isHapmap() ) s += ":Hapmap";
|
|
|
|
|
if ( is2Hit2Allele() ) s += ":2Hit";
|
|
|
|
|
return s;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String repl() {
|
|
|
|
|
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
|
|
|
|
585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType,
|
|
|
|
|
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void parseLine(final String[] parts) {
|
|
|
|
|
try {
|
|
|
|
|
String contig = parts[1];
|
|
|
|
|
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
|
|
|
|
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
|
|
|
|
loc = new GenomeLoc(contig, start, stop);
|
|
|
|
|
|
|
|
|
|
name = parts[4];
|
|
|
|
|
refBases = parts[5];
|
|
|
|
|
strand = parts[6];
|
|
|
|
|
observed = parts[9];
|
|
|
|
|
molType = parts[10];
|
|
|
|
|
varType = parts[11];
|
|
|
|
|
validationStatus = parts[12];
|
|
|
|
|
avHet = Double.parseDouble(parts[13]);
|
|
|
|
|
avHetSE = Double.parseDouble(parts[14]);
|
|
|
|
|
func = parts[15];
|
|
|
|
|
locType = parts[16];
|
|
|
|
|
weight = Integer.parseInt(parts[17]);
|
|
|
|
|
} catch ( RuntimeException e ) {
|
|
|
|
|
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-03 04:48:59 +08:00
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String getAltBasesFWD() {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public char getAltSnpFWD() throws IllegalStateException {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public double getConsensusConfidence() {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<String> getGenotype() throws IllegalStateException {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public double getMAF() {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public int getPloidy() throws IllegalStateException {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public double getVariationConfidence() {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public boolean isGenotype() {
|
|
|
|
|
// TODO Auto-generated method stub
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|