2009-06-26 23:43:41 +08:00
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2009 The Broad Institute
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person
|
|
|
|
|
* obtaining a copy of this software and associated documentation
|
|
|
|
|
* files (the "Software"), to deal in the Software without
|
|
|
|
|
* restriction, including without limitation the rights to use,
|
|
|
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
* copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following
|
|
|
|
|
* conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
|
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2009-07-22 00:20:10 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
2009-10-23 14:31:15 +08:00
|
|
|
import org.broadinstitute.sting.utils.Utils;
|
2009-09-19 04:19:34 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
|
|
|
|
import org.broadinstitute.sting.utils.genotype.Genotype;
|
2009-09-24 02:24:05 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
2009-11-01 13:35:47 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeCall;
|
2009-07-22 00:20:10 +08:00
|
|
|
|
|
|
|
|
import java.io.IOException;
|
2009-09-19 04:19:34 +08:00
|
|
|
import java.util.ArrayList;
|
2009-07-22 00:20:10 +08:00
|
|
|
import java.util.Arrays;
|
2009-09-14 13:34:33 +08:00
|
|
|
import java.util.List;
|
2009-07-22 00:20:10 +08:00
|
|
|
|
2009-10-07 01:44:24 +08:00
|
|
|
public class RodGeliText extends BasicReferenceOrderedDatum implements VariationRod, VariantBackedByGenotype {
|
2009-09-19 04:19:34 +08:00
|
|
|
public enum Genotype_Strings {
|
2009-09-15 12:48:42 +08:00
|
|
|
AA, AC, AG, AT, CC, CG, CT, GG, GT, TT
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-17 05:03:47 +08:00
|
|
|
public GenomeLoc loc;
|
|
|
|
|
public char refBase = 'N';
|
|
|
|
|
public int depth;
|
|
|
|
|
public int maxMappingQuality;
|
|
|
|
|
public String bestGenotype = "NN";
|
|
|
|
|
public double lodBtr;
|
|
|
|
|
public double lodBtnb;
|
|
|
|
|
public double[] genotypeLikelihoods = new double[10];
|
2009-09-15 12:48:42 +08:00
|
|
|
|
2009-09-05 02:40:43 +08:00
|
|
|
public RodGeliText(final String name) {
|
2009-07-22 00:20:10 +08:00
|
|
|
super(name);
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public String delimiterRegex() {
|
|
|
|
|
return "\\s+";
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
|
|
|
|
public boolean parseLine(Object header, String[] parts) throws IOException {
|
2009-09-15 12:48:42 +08:00
|
|
|
if (parts.length < 18)
|
2009-07-23 01:54:44 +08:00
|
|
|
throw new IOException("Invalid rodVariant row found -- too few elements. Expected 18+, got " + parts.length);
|
2009-06-18 05:32:21 +08:00
|
|
|
if (!parts[0].startsWith("#")) {
|
2009-06-22 22:39:41 +08:00
|
|
|
loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
|
2009-09-23 02:20:43 +08:00
|
|
|
refBase = Character.toUpperCase(parts[2].charAt(0));
|
2009-06-18 05:32:21 +08:00
|
|
|
depth = Integer.valueOf(parts[3]);
|
|
|
|
|
maxMappingQuality = Integer.valueOf(parts[4]);
|
2009-09-19 05:01:43 +08:00
|
|
|
|
2009-09-19 05:04:25 +08:00
|
|
|
// UPPER case and sort
|
2009-09-19 05:01:43 +08:00
|
|
|
char[] x = parts[5].toUpperCase().toCharArray();
|
|
|
|
|
Arrays.sort(x);
|
|
|
|
|
bestGenotype = new String(x);
|
|
|
|
|
|
2009-07-17 05:03:47 +08:00
|
|
|
lodBtr = Double.valueOf(parts[6]);
|
|
|
|
|
lodBtnb = Double.valueOf(parts[7]);
|
2009-06-18 05:32:21 +08:00
|
|
|
|
|
|
|
|
for (int pieceIndex = 8, offset = 0; pieceIndex < 18; pieceIndex++, offset++) {
|
2009-07-17 05:03:47 +08:00
|
|
|
genotypeLikelihoods[offset] = Double.valueOf(parts[pieceIndex]);
|
2009-06-18 05:32:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String toString() {
|
2009-07-10 03:45:18 +08:00
|
|
|
return String.format("%s\t%d\t%c\t%d\t%d\t%s\t%4.4f\t%4.4f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f",
|
2009-09-15 12:48:42 +08:00
|
|
|
loc.getContig(),
|
|
|
|
|
loc.getStart(),
|
|
|
|
|
refBase,
|
|
|
|
|
depth,
|
|
|
|
|
maxMappingQuality,
|
|
|
|
|
bestGenotype,
|
|
|
|
|
lodBtr,
|
|
|
|
|
lodBtnb,
|
|
|
|
|
genotypeLikelihoods[0],
|
|
|
|
|
genotypeLikelihoods[1],
|
|
|
|
|
genotypeLikelihoods[2],
|
|
|
|
|
genotypeLikelihoods[3],
|
|
|
|
|
genotypeLikelihoods[4],
|
|
|
|
|
genotypeLikelihoods[5],
|
|
|
|
|
genotypeLikelihoods[6],
|
|
|
|
|
genotypeLikelihoods[7],
|
|
|
|
|
genotypeLikelihoods[8],
|
|
|
|
|
genotypeLikelihoods[9]
|
2009-06-18 05:32:21 +08:00
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public GenomeLoc getLocation() {
|
|
|
|
|
return loc;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-14 13:34:33 +08:00
|
|
|
/**
|
|
|
|
|
* get the reference base(s) at this position
|
|
|
|
|
*
|
|
|
|
|
* @return the reference base or bases, as a string
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
2009-09-15 12:48:42 +08:00
|
|
|
public String getReference() {
|
|
|
|
|
return String.valueOf(this.refBase);
|
2009-09-14 13:34:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* get the -1 * (log 10 of the error value)
|
|
|
|
|
*
|
|
|
|
|
* @return the log based error estimate
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public double getNegLog10PError() {
|
|
|
|
|
return Math.abs(lodBtr);
|
|
|
|
|
}
|
|
|
|
|
|
2009-10-23 14:31:15 +08:00
|
|
|
/**
|
|
|
|
|
* gets the alternate alleles. This method should return all the alleles present at the location,
|
|
|
|
|
* NOT including the reference base. This is returned as a string list with no guarantee ordering
|
|
|
|
|
* of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
|
|
|
|
|
* frequency).
|
|
|
|
|
*
|
|
|
|
|
* @return an alternate allele list
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public List<String> getAlternateAlleleList() {
|
|
|
|
|
List<String> list = new ArrayList<String>();
|
|
|
|
|
for (char base : bestGenotype.toCharArray())
|
|
|
|
|
if (base != refBase)
|
|
|
|
|
list.add(String.valueOf(base));
|
|
|
|
|
return list;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* gets the alleles. This method should return all the alleles present at the location,
|
|
|
|
|
* including the reference base. The first allele should always be the reference allele, followed
|
|
|
|
|
* by an unordered list of alternate alleles.
|
|
|
|
|
*
|
|
|
|
|
* @return an alternate allele list
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public List<String> getAlleleList() {
|
|
|
|
|
List<String> list = new ArrayList<String>();
|
|
|
|
|
if (this.bestGenotype.contains(getReference())) list.add(getReference());
|
|
|
|
|
for (char c : this.bestGenotype.toCharArray())
|
|
|
|
|
if (c != Utils.stringToChar(getReference()))
|
|
|
|
|
list.add(String.valueOf(c));
|
|
|
|
|
return list;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-22 00:20:10 +08:00
|
|
|
public String getRefBasesFWD() {
|
|
|
|
|
return String.format("%c", getRefSnpFWD());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public char getRefSnpFWD() throws IllegalStateException {
|
|
|
|
|
return refBase;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getAltBasesFWD() {
|
|
|
|
|
return String.format("%c", getAltSnpFWD());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public char getAltSnpFWD() throws IllegalStateException {
|
2009-10-23 14:31:15 +08:00
|
|
|
// both ref and bestGenotype have been uppercased, so it's safe to use ==
|
2009-08-08 03:37:07 +08:00
|
|
|
char c = (bestGenotype.charAt(0) == refBase) ? bestGenotype.charAt(1) : bestGenotype.charAt(0);
|
|
|
|
|
//System.out.printf("%s : %c and %c%n", bestGenotype, refBase, c);
|
|
|
|
|
return c;
|
2009-07-22 00:20:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isReference() {
|
2009-08-20 04:32:29 +08:00
|
|
|
return refBase == bestGenotype.charAt(0) && refBase == bestGenotype.charAt(1);
|
2009-07-22 00:20:10 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-14 13:34:33 +08:00
|
|
|
/**
|
|
|
|
|
* get the frequency of this variant
|
|
|
|
|
*
|
|
|
|
|
* @return VariantFrequency with the stored frequency
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public double getNonRefAlleleFrequency() {
|
|
|
|
|
return 1.0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** @return the VARIANT_TYPE of the current variant */
|
|
|
|
|
@Override
|
|
|
|
|
public VARIANT_TYPE getType() {
|
|
|
|
|
return VARIANT_TYPE.SNP;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-22 00:20:10 +08:00
|
|
|
public boolean isSNP() {
|
2009-09-15 12:48:42 +08:00
|
|
|
if (this.getReference().length() == 1)
|
2009-09-23 04:54:47 +08:00
|
|
|
return (this.refBase != this.bestGenotype.charAt(0) || this.refBase != this.bestGenotype.charAt(1));
|
2009-09-15 12:48:42 +08:00
|
|
|
return false;
|
2009-07-22 00:20:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isInsertion() {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isDeletion() {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isIndel() {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-14 13:34:33 +08:00
|
|
|
/**
|
|
|
|
|
* gets the alternate base is the case of a SNP. Throws an IllegalStateException in the case
|
|
|
|
|
* of
|
|
|
|
|
*
|
|
|
|
|
* @return a char, representing the alternate base
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public char getAlternativeBaseForSNP() {
|
|
|
|
|
if (!this.isSNP()) throw new IllegalStateException("we're not a SNP");
|
2009-09-23 04:54:47 +08:00
|
|
|
// we know that if we're a SNP, the alt is a single base
|
2009-09-15 12:48:42 +08:00
|
|
|
if (this.bestGenotype.toString().charAt(0) == getReference().charAt(0))
|
2009-09-14 13:34:33 +08:00
|
|
|
return this.bestGenotype.toString().charAt(1);
|
|
|
|
|
return this.bestGenotype.toString().charAt(0);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
/**
|
|
|
|
|
* gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
|
|
|
|
*
|
|
|
|
|
* @return a char, representing the alternate base
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public char getReferenceForSNP() {
|
|
|
|
|
if (!isSNP()) throw new IllegalStateException("This site is not a SNP");
|
|
|
|
|
// we know that if we're a SNP, the reference is a single base
|
|
|
|
|
if (bestGenotype.toString().charAt(0) != getReference().charAt(0))
|
|
|
|
|
return bestGenotype.toString().charAt(1);
|
|
|
|
|
else
|
|
|
|
|
return bestGenotype.toString().charAt(0);
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-22 00:20:10 +08:00
|
|
|
public double getMAF() {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double getHeterozygosity() {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isGenotype() {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double getVariationConfidence() {
|
|
|
|
|
return lodBtr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double getConsensusConfidence() {
|
|
|
|
|
return lodBtnb;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public List<String> getGenotype() throws IllegalStateException {
|
|
|
|
|
return Arrays.asList(getBestGenotype());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getPloidy() throws IllegalStateException {
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isBiallelic() {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-11 23:01:50 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public int length() {
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
2009-07-28 11:25:03 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public char getReferenceBase() {
|
|
|
|
|
return refBase;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public int getPileupDepth() {
|
|
|
|
|
return depth;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public int getMaxMappingQuality() {
|
|
|
|
|
return maxMappingQuality;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public String getBestGenotype() {
|
|
|
|
|
return bestGenotype;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public double getLodBtr() {
|
|
|
|
|
return lodBtr;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public double getLodBtnb() {
|
|
|
|
|
return lodBtnb;
|
|
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
|
2009-09-15 12:48:42 +08:00
|
|
|
public double[] getGenotypeLikelihoods() {
|
|
|
|
|
return genotypeLikelihoods;
|
|
|
|
|
}
|
2009-06-18 15:26:37 +08:00
|
|
|
|
|
|
|
|
public void adjustLikelihoods(double[] likelihoods) {
|
|
|
|
|
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
|
|
|
|
|
genotypeLikelihoods[likelihoodIndex] += likelihoods[likelihoodIndex];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
String bestGenotype = "NN";
|
|
|
|
|
double bestLikelihood = Double.NEGATIVE_INFINITY;
|
|
|
|
|
double nextBestLikelihood = Double.NEGATIVE_INFINITY;
|
|
|
|
|
double refLikelihood = Double.NEGATIVE_INFINITY;
|
|
|
|
|
|
|
|
|
|
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
|
|
|
|
|
if (genotypeLikelihoods[likelihoodIndex] > bestLikelihood) {
|
|
|
|
|
bestLikelihood = genotypeLikelihoods[likelihoodIndex];
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
bestGenotype = Genotype_Strings.values()[likelihoodIndex].toString();
|
2009-06-18 15:26:37 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
|
|
|
|
|
if (genotypeLikelihoods[likelihoodIndex] > nextBestLikelihood && genotypeLikelihoods[likelihoodIndex] < bestLikelihood) {
|
|
|
|
|
nextBestLikelihood = genotypeLikelihoods[likelihoodIndex];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
|
2009-09-19 04:19:34 +08:00
|
|
|
if (refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(0) &&
|
|
|
|
|
refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(1)) {
|
2009-06-18 15:26:37 +08:00
|
|
|
refLikelihood = genotypeLikelihoods[likelihoodIndex];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.bestGenotype = bestGenotype;
|
2009-07-17 05:03:47 +08:00
|
|
|
this.lodBtr = (bestLikelihood - refLikelihood);
|
|
|
|
|
this.lodBtnb = (bestLikelihood - nextBestLikelihood);
|
2009-06-18 15:26:37 +08:00
|
|
|
}
|
2009-07-15 02:53:27 +08:00
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
|
|
|
|
|
/**
|
2009-09-19 06:25:16 +08:00
|
|
|
* get the genotype
|
2009-09-19 04:19:34 +08:00
|
|
|
*
|
2009-09-19 06:25:16 +08:00
|
|
|
* @return a map in lexigraphical order of the genotypes
|
2009-09-19 04:19:34 +08:00
|
|
|
*/
|
|
|
|
|
@Override
|
2009-09-19 06:38:51 +08:00
|
|
|
public Genotype getCalledGenotype() {
|
2009-11-01 13:35:47 +08:00
|
|
|
return new GeliGenotypeCall(refBase, getLocation(), bestGenotype, lodBtnb);
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-14 13:34:33 +08:00
|
|
|
/**
|
|
|
|
|
* get the likelihoods
|
|
|
|
|
*
|
|
|
|
|
* @return an array in lexigraphical order of the likelihoods
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
2009-09-19 04:19:34 +08:00
|
|
|
public List<Genotype> getGenotypes() {
|
|
|
|
|
List<Genotype> ret = new ArrayList<Genotype>();
|
2009-11-01 13:35:47 +08:00
|
|
|
ret.add(new GeliGenotypeCall(refBase, getLocation(), bestGenotype, lodBtnb));
|
2009-09-19 04:19:34 +08:00
|
|
|
return ret;
|
2009-09-15 12:48:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* do we have the specified genotype? not all backedByGenotypes
|
|
|
|
|
* have all the genotype data.
|
|
|
|
|
*
|
|
|
|
|
* @param x the genotype
|
|
|
|
|
*
|
|
|
|
|
* @return true if available, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public boolean hasGenotype(DiploidGenotype x) {
|
|
|
|
|
return (x.toString().equals(this.getAltBasesFWD()));
|
2009-09-14 13:34:33 +08:00
|
|
|
}
|
2009-06-18 05:32:21 +08:00
|
|
|
}
|