diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java index 9c90693bc..9ad2b8ac9 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java @@ -2,8 +2,9 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.genotype.Genotype; -import java.util.ArrayList; +import java.util.Arrays; import java.util.List; + /** * loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom * chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0 @@ -23,28 +24,6 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes, public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); } public boolean isReference() { return getVariationConfidence() < 0.01; } - /** - * gets the alternate base. Use this method if we're biallelic - * - * @return - */ - @Override - public String getAlternateBases() { - return getAltBasesFWD(); - } - - /** - * gets the alternate bases. Use this method if the allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - List str = new ArrayList(); - str.add(this.getAltBasesFWD()); - return str; - } - /** * get the frequency of this variant * @@ -120,6 +99,33 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes, return this.getVariationConfidence(); } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + return Arrays.asList(getAltBasesFWD()); + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + List alleles = Arrays.asList(this.getReference()); + alleles.addAll(getAlternateAlleleList()); + return alleles; + } + public int length() { return 1; } // SNPCallFromGenotypes interface diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java index 88b08253f..f4fce2ba1 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java @@ -161,10 +161,9 @@ public class RodGLF implements VariationRod, Iterator { @Override public char getAlternativeBaseForSNP() { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(1); - return getAlternateBases().charAt(0); - + List alleles = this.getAlternateAlleleList(); + if (alleles.size() != 1) throw new StingException("We're not biAllelic()"); + return Utils.stringToChar(alleles.get(0)); } /** @@ -175,10 +174,7 @@ public class RodGLF implements VariationRod, Iterator { @Override public char getReferenceForSNP() { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(0); - return getAlternateBases().charAt(1); - + return Utils.stringToChar(getReference()); } /** @@ -207,7 +203,7 @@ public class RodGLF implements VariationRod, Iterator { * Get the nth best genotype (one based), i.e. to get the best genotype pass in 1, * the second best 2, etdc. * - * @param nthBest the nth best genotype to get + * @param nthBest the nth best genotype to get (1 based, NOT ZERO BASED) * * @return a GENOTYPE object representing the nth best genotype */ @@ -253,28 +249,6 @@ public class RodGLF implements VariationRod, Iterator { ((VariableLengthCall) mRecord).getIndelLen1() < 0); } - /** - * get the base representation of this Variant - * - * @return a string, of ploidy - */ - @Override - public String getAlternateBases() { - return this.getBestGenotype(1).toString(); - } - - /** - * gets the alternate bases. Use this method if teh allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - list.add(this.getAlternateBases()); - return list; - } - /** * Returns minor allele frequency. * @@ -310,6 +284,42 @@ public class RodGLF implements VariationRod, Iterator { return Math.abs(getBestGenotypeValue(1) - ((SinglePointCall) mRecord).getLikelihoods()[index]) / GLFRecord.LIKELIHOOD_SCALE_FACTOR; } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + LikelihoodObject.GENOTYPE genotype = getBestGenotype(1); + List ret = new ArrayList(); + for (char c : genotype.toString().toCharArray()) { + if (!String.valueOf(c).equals(this.getReference())) ret.add(String.valueOf(c)); + } + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + LikelihoodObject.GENOTYPE genotype = getBestGenotype(1); + List list = new ArrayList(); + if (genotype.toString().contains(this.getReference())) list.add(this.getReference()); + for (char c : genotype.toString().toCharArray()) + if (c != Utils.stringToChar(getReference())) + list.add(String.valueOf(c)); + return list; + } + public int length() { return 1; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGeliText.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGeliText.java index fae7321a1..1630300aa 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodGeliText.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGeliText.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.Genotype; @@ -134,6 +135,40 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation return Math.abs(lodBtr); } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + List list = new ArrayList(); + for (char base : bestGenotype.toCharArray()) + if (base != refBase) + list.add(String.valueOf(base)); + return list; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + List list = new ArrayList(); + if (this.bestGenotype.contains(getReference())) list.add(getReference()); + for (char c : this.bestGenotype.toCharArray()) + if (c != Utils.stringToChar(getReference())) + list.add(String.valueOf(c)); + return list; + } + public String getRefBasesFWD() { return String.format("%c", getRefSnpFWD()); } @@ -147,7 +182,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation } public char getAltSnpFWD() throws IllegalStateException { - // both ref and bestGenotype have been uppercased, so it's safe to use == + // both ref and bestGenotype have been uppercased, so it's safe to use == char c = (bestGenotype.charAt(0) == refBase) ? bestGenotype.charAt(1) : bestGenotype.charAt(0); //System.out.printf("%s : %c and %c%n", bestGenotype, refBase, c); return c; @@ -187,28 +222,6 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation return false; } - /** - * get the base representation of this Variant - * - * @return a string, of ploidy - */ - @Override - public String getAlternateBases() { - return this.bestGenotype; - } - - /** - * gets the alternate bases. If this is homref, throws an UnsupportedOperationException - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - list.add(this.getAlternateBases()); - return list; - } - public boolean isIndel() { return false; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java index f1da8fe29..fa53b011e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGenotypeChipAsGFF.java @@ -2,10 +2,13 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; + import java.util.*; import java.util.regex.MatchResult; import java.util.regex.Pattern; @@ -96,6 +99,34 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements return 4; // 1/10000 error } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + throw new StingException("Hapmap is unable to provide an alternate allele list; the reference is unknown"); + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + List ret = new ArrayList(); + for (char c : feature.toCharArray()) + ret.add(String.valueOf(c)); + return ret; + } + public String getAttribute(final String key) { return attributes.get(key); } @@ -181,28 +212,6 @@ public class RodGenotypeChipAsGFF extends BasicReferenceOrderedDatum implements public char getAltSnpFWD() throws IllegalStateException { return 0; } public boolean isReference() { return ! isSNP(); } - /** - * gets the alternate bases. If this is homref, throws an UnsupportedOperationException - * - * @return - */ - @Override - public String getAlternateBases() { - return this.feature; - } - - /** - * gets the alternate bases. Use this method if teh allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - list.add(this.getAlternateBases()); - return list; - } - /** * get the frequency of this variant * diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java index 172e30c52..9299f2a7b 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodVCF.java @@ -183,7 +183,7 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, /** are we bi-allelic? */ @Override public boolean isBiallelic() { - return (this.getAlternateBaseList().size() == 1); + return (this.getAlternateAlleleList().size() == 1); } /** @@ -197,6 +197,37 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, return mCurrentRecord.getQual() / 10.0; } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + List list = new ArrayList(); + for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles()) + list.add(enc.toString()); + return list; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + List ret = new ArrayList(); + ret.add(String.valueOf(mCurrentRecord.getReferenceBase())); + ret.addAll(getAlternateAlleleList()); + return ret; + } + /** * are we truely a variant, given a reference * @@ -207,31 +238,6 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, return (!mCurrentRecord.hasAlternateAllele()); } - /** - * gets the alternate bases. If this is homref, throws an UnsupportedOperationException - * - * @return - */ - @Override - public String getAlternateBases() { - if (!this.isBiallelic()) - throw new UnsupportedOperationException("We're not biallelic, so please call getAlternateBaseList instead"); - return this.mCurrentRecord.getAlternateAlleles().get(0).toString(); - } - - /** - * gets the alternate bases. If this is homref, throws an UnsupportedOperationException - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - for (VCFGenotypeEncoding enc : mCurrentRecord.getAlternateAlleles()) - list.add(enc.toString()); - return list; - } - /** * are we an insertion or a deletion? yes, then return true. No? Well, false then. * diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java index dc192fb80..c4eff9ac9 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java @@ -1,9 +1,10 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.genotype.Genotype; -import java.util.Arrays; + +import java.util.ArrayList; import java.util.List; public class SangerSNPROD extends TabularROD implements SNPCallFromGenotypes { @@ -100,23 +101,35 @@ public class SangerSNPROD extends TabularROD implements SNPCallFromGenotypes { } /** - * gets the alternate base. Use this method if we're biallelic + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). * - * @return + * @return an alternate allele list */ @Override - public String getAlternateBases() { - return this.get("3"); + public List getAlternateAlleleList() { + List ret = new ArrayList(); + for (char c : get("3").toCharArray()) + ret.add(String.valueOf(c)); + return ret; } /** - * gets the alternate bases. Use this method if the allele count is greater then 2 (not biallelic) + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. * - * @return + * @return an alternate allele list */ @Override - public List getAlternateBaseList() { - return Arrays.asList(this.get("3")); + public List getAlleleList() { + List ret = new ArrayList(); + ret.add(this.getReference()); + for (char c : get("3").toCharArray()) + ret.add(String.valueOf(c)); + return ret; } public int length() { return 1; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SequenomROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SequenomROD.java index b838920c1..cd9216990 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SequenomROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SequenomROD.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.Variation; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -96,23 +97,31 @@ public class SequenomROD extends TabularROD implements Variation { } /** - * gets the alternate bases. Use this method if we're biallelic + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). * - * @return + * @return an alternate allele list */ @Override - public String getAlternateBases() { - return getAltBasesFWD(); + public List getAlternateAlleleList() { + List ret = new ArrayList(); + for (char c: getAltBasesFWD().toCharArray()) + ret.add(String.valueOf(c)); + return ret; } /** - * gets the alternate bases. Use this method if the allele count is greater then 2 (not biallelic) + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. * - * @return + * @return an alternate allele list */ @Override - public List getAlternateBaseList() { - throw new StingException("SequenomRod is not biallelic"); + public List getAlleleList() { + throw new StingException("SequenomRod doesn't know of the reference, and can't generate allele lists"); } public boolean isHom() { return false; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java index 37f5fb9cd..7691bec2c 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java @@ -67,26 +67,6 @@ public class SimpleIndelROD extends TabularROD implements Genotype, VariationRod public boolean isSNP() { return false; } public boolean isReference() { return false; } - /** - * gets the alternate base. Use this method if we're biallelic - * - * @return - */ - @Override - public String getAlternateBases() { - return getFWDAlleles().get(0); - } - - /** - * gets the alternate bases. Use this method if teh allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - return getFWDAlleles(); - } - public boolean isInsertion() { if ( is1KGFormat() ) return this.get("3").equals("I"); @@ -135,6 +115,35 @@ public class SimpleIndelROD extends TabularROD implements Genotype, VariationRod return getVariationConfidence(); } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + List ret = getAlleleList(); + for (String val : ret) { + if (val.equals(this.getReference())) ret.remove(val); + } + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + return this.getFWDAlleles(); + } + public boolean isHom() { return false; } public boolean isHet() { return false; } public double getHeterozygosity() { return 0.0; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 93f0df4bb..34a04a02d 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -4,10 +4,12 @@ import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; /** @@ -22,8 +24,8 @@ import java.util.List; */ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod, VariantBackedByGenotype { public GenomeLoc loc; // genome location of SNP - // Reference sequence chromosome or scaffold - // Start and stop positions in chrom + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom public String name; // Reference SNP identifier or Affy SNP name public String strand; // Which DNA strand contains the observed alleles @@ -33,18 +35,18 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod public String molType; // Sample type from exemplar ss public String varType; // The class of variant (simple, insertion, deletion, range, etc.) - // Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion' + // Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion' public String validationStatus; // The validation status of the SNP - // one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap') + // one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap') public double avHet; // The average heterozygosity from all observations public double avHetSE; // The Standard Error for the average heterozygosity public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.) - // set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5', - // 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5') + // set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5', + // 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5') public String locType; // How the variant affects the reference sequence - // enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion') + // enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion') public int weight; // The quality of the alignment @@ -73,7 +75,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public String getReference() { - return getRefBasesFWD(); + return refBases; } /** @@ -86,58 +88,43 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod return 4; // -log10(0.0001) } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + public List getAlternateAlleleList() { + List ret = getAlleleList(); + for (String allele : ret) + if (allele.equals(getReference())) ret.remove(allele); + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + public List getAlleleList() { + List ret; //ref first!!!!! + if (onFwdStrand()) + ret = Arrays.asList(observed.split("/")); + else + ret = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/")); + if (ret.size() > 0 && ret.contains(getReference()) && !ret.get(0).equals(this.getReference())) + Collections.swap(ret,ret.indexOf(getReference()),0); + return ret; + } + public boolean onFwdStrand() { return strand.equals("+"); } - /** - * Returns bases in the reference allele as a String. String can be empty (as in insertion into - * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters - * (for longer indels). - * - * @return reference allele, forward strand - */ - public String getRefBasesFWD() { - // fix - at least this way we ensure that we'll get the other base compared to getAltBasesFWD() - return (getAllelesFWD().get(0).equals(refBases)) ? getAllelesFWD().get(0) : getAllelesFWD().get(1); - //if ( onFwdStrand() ) - // return refBases; - //else - // return SequenceUtil.reverseComplement(refBases); - } - - /** - * Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException - * if variant is not a SNP. - * - * @return reference base on the forward strand - */ - public char getRefSnpFWD() throws IllegalStateException { - //System.out.printf("refbases is %s but %s%n", refBases, toString()); - if (isIndel()) throw new IllegalStateException("Variant is not a SNP"); - // fix - at least this way we ensure that we'll get the other base compared to getAltBasesFWD() - List alleles = getAllelesFWD(); - String val = (alleles.get(0).equals(refBases) ? alleles.get(0) : alleles.get(1)); - return val.charAt(0); - // if ( onFwdStrand() ) return refBases.charAt(0); - // else return SequenceUtil.reverseComplement(refBases).charAt(0); - } - - public List getAllelesFWD() { - List alleles = null; - if (onFwdStrand()) - alleles = Arrays.asList(observed.split("/")); - else - alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/")); - - //System.out.printf("getAlleles %s on %s %b => %s %n", observed, strand, onFwdStrand(), Utils.join("/", alleles)); - return alleles; - } - - public String getAllelesFWDString() { - return Utils.join("", getAllelesFWD()); - } - /** * get the frequency of this variant * @@ -145,7 +132,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public double getNonRefAlleleFrequency() { - return 0; //To change body of implemented methods use File | Settings | File Templates. + return 0; // dbSNP doesn't know the allele frequency } /** @return the VARIANT_TYPE of the current variant */ @@ -170,28 +157,6 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod return varType.contains("deletion"); } - /** - * get the base representation of this Variant - * - * @return a string, of ploidy - */ - @Override - public String getAlternateBases() { - return getAllelesFWDString(); - } - - /** - * gets the alternate bases. Use this method if teh allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - list.add(this.getAlternateBases()); - return list; - } - public boolean isIndel() { return isInsertion() || isDeletion() || varType.contains("in-del"); } @@ -204,11 +169,12 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public char getAlternativeBaseForSNP() { - return getAltSnpFWD(); /* - if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - if (getAlternateBases().charAt(0) == this.getReference()) - return getAlternateBases().charAt(1); - return getAlternateBases().charAt(0); */ + if (!isSNP()) throw new StingException("We're not a SNP; called in DbSNP rod at position " + this.loc); + if (!isBiallelic()) throw new StingException("We're not biallelic; at position " + this.loc); + List ret = this.getAlternateAlleleList(); + if (ret.size() == 1 && ret.get(0).length() == 1) + return ret.get(0).charAt(0); + throw new StingException("getAlternativeBaseForSNP failed for DbSNP rod " + this.loc); } /** @@ -218,12 +184,14 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public char getReferenceForSNP() { - return 0; //To change body of implemented methods use File | Settings | File Templates. + if (!isSNP()) throw new StingException("We're not a SNP; called in DbSNP rod at position " + this.loc); + if (refBases.length() != 1) throw new StingException("The reference base in DbSNP must be zero, at position " + this.loc + " was " + refBases); + return refBases.charAt(0); // we know it's length 1, this is ok } public boolean isReference() { - return false; - } // snp locations are never "reference", there's always a variant + return false; // snp locations are never "reference", there's always a variant + } public boolean isHapmap() { return validationStatus.contains("by-hapmap"); @@ -250,7 +218,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod } public String toMediumString() { - String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString()); + String s = String.format("%s:%s:%s", getLocation().toString(), name, Utils.join("",this.getAlleleList())); if (isSNP()) s += ":SNP"; if (isIndel()) s += ":Indel"; if (isHapmap()) s += ":Hapmap"; @@ -300,49 +268,12 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod } } - public String getAltBasesFWD() { - List alleles = getAllelesFWD(); - return (alleles.get(0).equals(refBases) ? alleles.get(1) : alleles.get(0)); - } - - public char getAltSnpFWD() throws IllegalStateException { - if (!isSNP()) - throw new IllegalStateException("I'm not a SNP"); - return getAltBasesFWD().charAt(0); - } - - public double getConsensusConfidence() { - // TODO Auto-generated method stub - return Double.MAX_VALUE; - } - - public List getGenotype() throws IllegalStateException { - return Arrays.asList(Utils.join("", getAllelesFWD())); - } - - public double getMAF() { - // Fixme: update to actually get MAF - //return avHet; - return -1; - } - public double getHeterozygosity() { return avHet; } public int getPloidy() throws IllegalStateException { - // TODO Auto-generated method stub - return 0; - } - - public double getVariationConfidence() { - // TODO Auto-generated method stub - return Double.MAX_VALUE; - } - - public boolean isGenotype() { - // TODO Auto-generated method stub - return false; + return 2; // our DbSNP assumes a diploid human } public boolean isBiallelic() { @@ -350,10 +281,6 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod return observed.indexOf('/') == observed.lastIndexOf('/'); } - public int length() { - return (int) (loc.getStop() - loc.getStart() + 1); - } - /** * get the genotype * @@ -361,7 +288,10 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public org.broadinstitute.sting.utils.genotype.Genotype getCalledGenotype() { - return new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence()); + return new BasicGenotype(getLocation(), + BasicGenotype.alleleListToString(getAlleleList()), + Utils.stringToChar(getReference()), + getNegLog10PError()); } /** @@ -371,11 +301,12 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public List getGenotypes() { - List list = new ArrayList(); - list.add(new BasicGenotype(this.getLocation(), this.getAltBasesFWD(), this.getRefSnpFWD(), this.getConsensusConfidence())); + ArrayList list = new ArrayList(); + list.add(getCalledGenotype()); return list; } + /** * do we have the specified genotype? not all backedByGenotypes * have all the genotype data. @@ -386,21 +317,21 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod */ @Override public boolean hasGenotype(DiploidGenotype x) { - return (!x.toString().equals(this.getAltBasesFWD())) ? false : true; + return (!x.toString().equals(BasicGenotype.alleleListToString(getAlleleList()))) ? false : true; } public static rodDbSNP getFirstRealSNP(RODRecordList dbsnpList) { - if ( dbsnpList == null ) - return null; + if (dbsnpList == null) + return null; - rodDbSNP dbsnp = null; - for ( ReferenceOrderedDatum d : dbsnpList ) { - if ( ((rodDbSNP)d).isSNP() ) { - dbsnp = (rodDbSNP)d; - break; - } - } + rodDbSNP dbsnp = null; + for (ReferenceOrderedDatum d : dbsnpList) { + if (((rodDbSNP) d).isSNP()) { + dbsnp = (rodDbSNP) d; + break; + } + } - return dbsnp; + return dbsnp; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index 820553e7c..e5ecab166 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -49,19 +49,20 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { continue; // if we have multiple variants at a locus, just take the first damn one we see for now Variation variant = (Variation) rod; + if (variant.getAlleleList().size() != 2) System.err.println("Not two " + Utils.join("-",variant.getAlleleList())); if (!rod.getName().startsWith("snpmask") && variant.isDeletion()) { - deletionBasesRemaining = variant.getAlternateBases().length(); + deletionBasesRemaining = variant.getAlleleList().get(0).length(); basesSeen++; if (indelsWriter != null) - indelsWriter.println(fasta.getCurrentID() + ":" + basesSeen + "-" + (basesSeen + variant.getAlternateBases().length())); + indelsWriter.println(fasta.getCurrentID() + ":" + basesSeen + "-" + (basesSeen + variant.getAlleleList().get(0).length())); // delete the next n bases, not this one return new Pair(context.getLocation(), refBase); } else if (!rod.getName().startsWith("snpmask") && variant.isInsertion()) { basesSeen++; if (indelsWriter != null) - indelsWriter.println(fasta.getCurrentID() + ":" + basesSeen + "-" + (basesSeen + variant.getAlternateBases().length())); - basesSeen += variant.getAlternateBases().length(); - return new Pair(context.getLocation(), refBase.concat(variant.getAlternateBases())); + indelsWriter.println(fasta.getCurrentID() + ":" + basesSeen + "-" + (basesSeen + variant.getAlleleList().get(0).length())); + basesSeen += variant.getAlleleList().get(0).length(); + return new Pair(context.getLocation(), refBase.concat(Utils.join("",variant.getAlleleList()))); } else if (variant.isSNP()) { basesSeen++; return new Pair(context.getLocation(), (rod.getName().startsWith("snpmask") ? "N" : String.valueOf(variant.getAlternativeBaseForSNP()))); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java index 81c370d0a..ed7edb695 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java @@ -70,9 +70,9 @@ public class PickSequenomProbes extends RefWalker { if ( variant.isSNP() ) assay_sequence = leading_bases + "[" + refBase + "/" + variant.getAlternativeBaseForSNP() + "]" + trailing_bases; else if ( variant.isInsertion() ) - assay_sequence = leading_bases + refBase + "[-/" + variant.getAlternateBases() + "]" + trailing_bases; + assay_sequence = leading_bases + refBase + "[-/" + Utils.join("",variant.getAlleleList()) + "]" + trailing_bases; else if ( variant.isDeletion() ) - assay_sequence = leading_bases + refBase + "[" + variant.getAlternateBases() + "/-]" + trailing_bases.substring(variant.getAlternateBases().length()); + assay_sequence = leading_bases + refBase + "[" + Utils.join("",variant.getAlleleList()) + "/-]" + trailing_bases.substring(variant.getAlleleList().size()); else return ""; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java index 3f4b01568..c158d547f 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java @@ -1,22 +1,22 @@ package org.broadinstitute.sting.playground.gatk.walkers; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import net.sf.samtools.SAMRecord; - -import java.util.List; -import java.util.Formatter; -import static java.lang.Math.log10; - import edu.mit.broad.picard.genotype.DiploidGenotype; import net.sf.picard.PicardException; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import static java.lang.Math.log10; +import java.util.Arrays; +import java.util.Formatter; +import java.util.List; public class PopPriorWalker extends LocusWalker { @@ -174,7 +174,7 @@ public class PopPriorWalker extends LocusWalker { priors = getKnownSiteKnownFreqPriors(((byte)(upRef & 0xff)), knownAlleles, hapmap.getVarAlleleFreq()); } else if (dbsnpInfo != null && dbsnpInfo.isSNP()) { - List knownAlleles = dbsnpInfo.getAllelesFWD(); + List knownAlleles = Arrays.asList(Utils.join("",dbsnpInfo.getAlleleList())); priorType = "DBSNP"; rodString = "[DBSNP: " + dbsnpInfo.toMediumString() + "]"; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java index 563dea3c4..940b66de7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java @@ -68,7 +68,8 @@ public class IndelSubsets implements ConcordanceType { // only deal with a valid indel Variation indel = ( indel1 != null ? indel1 : indel2 ); - int size = ( indel.getAlternateBases().length() <= sizeCutoff ? 0 : 1 ); + // we only deal with the first allele + int size = ( indel.getAlternateAlleleList().get(0).length() <= sizeCutoff ? 0 : 1 ); int homopol = ( homopolymerRunSize(ref, indel) <= homopolymerCutoff ? 0 : 1 ); writers[set1][set2][size][homopol].println(indel.toString()); @@ -80,7 +81,7 @@ public class IndelSubsets implements ConcordanceType { GenomeLoc locus = ref.getLocus(); int refBasePos = (int)(locus.getStart() - window.getStart()); - char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAlternateBases().charAt(0); + char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAlternateAlleleList().get(0).charAt(0); int leftRun = 0; for ( int i = refBasePos; i >= 0; i--) { if ( bases[i] != indelBase ) @@ -88,9 +89,9 @@ public class IndelSubsets implements ConcordanceType { leftRun++; } - indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.getAlternateBases().length(),bases.length-1)] : indel.getAlternateBases().charAt(indel.getAlternateBases().length()-1); + indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.getAlternateAlleleList().get(0).length(),bases.length-1)] : indel.getAlternateAlleleList().get(0).charAt(indel.getAlternateAlleleList().get(0).length()-1); int rightRun = 0; - for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.getAlternateBases().length() : 1); i < bases.length; i++) { + for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.getAlternateAlleleList().get(0).length() : 1); i < bases.length; i++) { if ( bases[i] != indelBase ) break; rightRun++; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java index 6878d91fa..28842eafd 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/IndelMetricsAnalysis.java @@ -12,10 +12,9 @@ import java.util.List; * SOFTWARE COPYRIGHT NOTICE AGREEMENT * This software and its documentation are copyright 2009 by the * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * + *

* This software is supplied without any warranty or guaranteed support whatsoever. Neither * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * */ public class IndelMetricsAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis { long insertions = 0; @@ -30,19 +29,20 @@ public class IndelMetricsAnalysis extends BasicVariantAnalysis implements Genoty } public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) { - if ( eval != null && eval.isInsertion() ) { - if ( eval.isInsertion() ) + if (eval != null && eval.isInsertion()) { + if (eval.isInsertion()) insertions++; - else if ( eval.isDeletion() ) + else if (eval.isDeletion()) deletions++; else throw new RuntimeException("Variation is indel, but isn't insertion or deletion!"); - if ( eval.getAlternateBases().length() < 100 ) { - sizes[eval.isDeletion() ? 0 : 1][eval.getAlternateBases().length()]++; - if ( eval.getAlternateBases().length() > maxSize ) - maxSize = eval.getAlternateBases().length(); - } + for (String allele : eval.getAlleleList()) + if (allele.length() < 100) { + sizes[eval.isDeletion() ? 0 : 1][allele.length()]++; + if (allele.length() > maxSize) + maxSize = allele.length(); + } } return null; @@ -56,7 +56,7 @@ public class IndelMetricsAnalysis extends BasicVariantAnalysis implements Genoty s.add("Size Distribution"); s.add("size\tdeletions\tinsertions"); - for ( int i = 1; i <= maxSize; i++ ) + for (int i = 1; i <= maxSize; i++) s.add(String.format("%d\t%d\t\t%d", i, sizes[0][i], sizes[1][i])); return s; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java index 8af756b2d..3b4d4210b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java @@ -201,7 +201,7 @@ class PooledConcordanceTable { public boolean pooledCallIsRef(Variation eval, char ref) { // code broken out for easy alteration when we start using pool-specific variations - return eval.getAlternateBases().equalsIgnoreCase((Utils.dupString(ref,2))); + return Utils.join("",eval.getAlleleList()).equalsIgnoreCase((Utils.dupString(ref,2))); } public int calculateNumFrequencyIndeces(int poolSize) { @@ -225,7 +225,7 @@ class PooledConcordanceTable { for ( Variation eval : evals ) { if ( mismatchingCalls(firstEval, eval, ref) ) { // todo -- make this not a StingException but go to the log - throw new StingException("Tri-Allelic Position "+eval.getAlternateBases()+"/"+firstEval.getAlternateBases() + " Ref: "+ ref + " not supported"); + throw new StingException("Tri-Allelic Position "+Utils.join("",eval.getAlleleList())+"/"+Utils.join("",firstEval.getAlleleList()) + " Ref: "+ ref + " not supported"); } else { alternateFrequency += calledVariantFrequency(eval,ref); } @@ -249,10 +249,10 @@ class PooledConcordanceTable { public boolean mismatchingCalls(Variation eval, Variation chip, char ref) { // eval and chip guaranteed to be non-null - char chipF = chip.getAlternateBases().charAt(0); - char chipS = chip.getAlternateBases().charAt(1); - char evalF = chip.getAlternateBases().charAt(0); - char evalS = chip.getAlternateBases().charAt(1); + char chipF = Utils.stringToChar(chip.getAlleleList().get(0)); + char chipS = Utils.stringToChar(chip.getAlleleList().get(1)); + char evalF = Utils.stringToChar(eval.getAlleleList().get(0)); + char evalS = Utils.stringToChar(eval.getAlleleList().get(1)); boolean mismatch; if (chipF == ref) { if ( chipS == ref ) { @@ -277,7 +277,7 @@ class PooledConcordanceTable { public double calledVariantFrequency( Variation var, char ref ) { // code broken out for easy alteration when we start using pool-specific variations - String varStr = var.getAlternateBases(); + String varStr = Utils.join("",var.getAlleleList()); double freq; if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) { freq = (double) 2; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java index bc5058e07..faae1be0b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantDBCoverage.java @@ -1,7 +1,9 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.refdata.BrokenRODSimulator; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.Variation; import java.util.ArrayList; @@ -90,9 +92,9 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA } public boolean discordantP(Variation dbSNP, Variation eval) { if (eval != null) { - char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : eval.getReference().charAt(0); + char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : Utils.stringToChar(eval.getReference()); if (dbSNP != null && dbSNP.isSNP()) - return !dbSNP.getAlternateBases().contains(String.valueOf(alt)); + return !dbSNP.getAlleleList().contains(String.valueOf(alt)); } return false; } @@ -125,7 +127,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA if (dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval)) { return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval); } else if (dbsnp.isIndel() && eval.isSNP()) { - return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getAlternateBases(), eval); + return String.format("SNP-at-indel DBSNP=%s %s", Utils.join("",dbsnp.getAlleleList()), eval); } else { return null; } diff --git a/java/src/org/broadinstitute/sting/utils/GenotypeUtils.java b/java/src/org/broadinstitute/sting/utils/GenotypeUtils.java index 53b3f1359..818d671bc 100644 --- a/java/src/org/broadinstitute/sting/utils/GenotypeUtils.java +++ b/java/src/org/broadinstitute/sting/utils/GenotypeUtils.java @@ -81,7 +81,7 @@ public class GenotypeUtils { if ( var instanceof Genotype ) return ((Genotype)var).isHet(); - String genotype = var.getAlternateBases(); + String genotype = Utils.join("",var.getAlleleList()); if ( genotype.length() < 1 ) return false; diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 707a48d46..efbbed182 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -2,16 +2,11 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.*; import net.sf.samtools.util.StringUtil; -import net.sf.picard.reference.ReferenceSequenceFile; - -import java.util.*; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.BufferedReader; - import org.apache.log4j.Logger; +import java.io.File; +import java.util.*; + /** * Created by IntelliJ IDEA. * User: depristo @@ -20,9 +15,7 @@ import org.apache.log4j.Logger; * To change this template use File | Settings | File Templates. */ public class Utils { - /** - * our log, which we want to capture anything from this class - */ + /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); public static void warnUser(final String msg) { @@ -45,25 +38,28 @@ public class Utils { /** * Compares two objects, either of which might be null. + * * @param lhs One object to compare. * @param rhs The other object to compare. + * * @return True if the two objects are equal, false otherwise. */ public static boolean equals(Object lhs, Object rhs) { - if( lhs == null && rhs == null ) return true; - else if( lhs == null ) return false; + if (lhs == null && rhs == null) return true; + else if (lhs == null) return false; else return lhs.equals(rhs); } public static List cons(final T elt, final List l) { List l2 = new ArrayList(); l2.add(elt); - if ( l != null ) l2.addAll(l); + if (l != null) l2.addAll(l); return l2; } /** * pretty print the warning message supplied + * * @param message the message */ private static void prettyPrintWarningMessage(String message) { @@ -71,13 +67,13 @@ public class Utils { while (builder.length() > 70) { int space = builder.lastIndexOf(" ", 70); if (space <= 0) space = 70; - logger.warn(String.format("* %s", builder.substring(0,space))); - builder.delete(0,space + 1); + logger.warn(String.format("* %s", builder.substring(0, space))); + builder.delete(0, space + 1); } logger.warn(String.format("* %s", builder)); } - public static SAMFileHeader copySAMFileHeader( SAMFileHeader toCopy ) { + public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { SAMFileHeader copy = new SAMFileHeader(); copy.setSortOrder(toCopy.getSortOrder()); @@ -86,9 +82,9 @@ public class Utils { copy.setReadGroups(toCopy.getReadGroups()); copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - for ( Map.Entry e : toCopy.getAttributes()) + for (Map.Entry e : toCopy.getAttributes()) copy.setAttribute(e.getKey(), e.getValue()); - + return copy; } @@ -104,6 +100,7 @@ public class Utils { * * @param pred filtering condition ( objects, for which pred.apply() is true pass the filter ) * @param c collection to filter (will not be modified) + * * @return new list built from elements of passing the filter * @see #filterInPlace(Predicate pred, Collection c) */ @@ -135,6 +132,7 @@ public class Utils { * * @param pred filtering condition (only elements, for which pred.apply() is true will be kept in the collection) * @param c collection to filter (will be modified - should be mutable and should implement remove() ) + * * @return reference to the same (modified) collection * @see #filter(Predicate pred, Collection c) */ @@ -175,12 +173,12 @@ public class Utils { public static ArrayList subseq(char[] fullArray) { byte[] fullByteArray = new byte[fullArray.length]; - StringUtil.charsToBytes(fullArray,0,fullArray.length,fullByteArray,0); + StringUtil.charsToBytes(fullArray, 0, fullArray.length, fullByteArray, 0); return subseq(fullByteArray); } public static ArrayList subseq(byte[] fullArray) { - return subseq(fullArray, 0, fullArray.length-1); + return subseq(fullArray, 0, fullArray.length - 1); } public static ArrayList subseq(byte[] fullArray, int start, int end) { @@ -204,9 +202,9 @@ public class Utils { public static boolean is454Read(SAMRecord read) { SAMReadGroupRecord readGroup = read.getReadGroup(); - if ( readGroup != null ) { + if (readGroup != null) { Object readPlatformAttr = readGroup.getAttribute("PL"); - if ( readPlatformAttr != null ) + if (readPlatformAttr != null) return readPlatformAttr.toString().toUpperCase().contains("454"); } return false; @@ -248,7 +246,7 @@ public class Utils { return ""; } StringBuilder ret = new StringBuilder(strings[start]); - for (int i = start+1; i < end; ++i) { + for (int i = start + 1; i < end; ++i) { ret.append(separator); ret.append(strings[i]); } @@ -332,13 +330,13 @@ public class Utils { public static Integer[] SortPermutation(final double[] A) { class comparator implements Comparator { public int compare(Integer a, Integer b) { - if (A[a.intValue()] < A[ b.intValue() ]) { + if (A[a.intValue()] < A[b.intValue()]) { return -1; } - if (A[ a.intValue() ] == A[ b.intValue() ]) { + if (A[a.intValue()] == A[b.intValue()]) { return 0; } - if (A[ a.intValue() ] > A[ b.intValue() ]) { + if (A[a.intValue()] > A[b.intValue()]) { return 1; } return 0; @@ -401,8 +399,7 @@ public class Utils { return output; } - public static List PermuteList(List list, Integer[] permutation) - { + public static List PermuteList(List list, Integer[] permutation) { List output = new ArrayList(); for (int i = 0; i < permutation.length; i++) { output.add(list.get(permutation[i])); @@ -412,47 +409,59 @@ public class Utils { /** Draw N random elements from list. */ - public static List RandomSubset(List list, int N) - { - if (list.size() <= N) { return list; } + public static List RandomSubset(List list, int N) { + if (list.size() <= N) { + return list; + } java.util.Random random = new java.util.Random(); int idx[] = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { idx[i] = random.nextInt(); } + for (int i = 0; i < list.size(); i++) { + idx[i] = random.nextInt(); + } - Integer[] perm = SortPermutation(idx); + Integer[] perm = SortPermutation(idx); List ans = new ArrayList(); - for (int i = 0; i < N; i++) { ans.add(list.get(perm[i])); } + for (int i = 0; i < N; i++) { + ans.add(list.get(perm[i])); + } return ans; } // lifted from the internet // http://www.cs.princeton.edu/introcs/91float/Gamma.java.html - public static double logGamma(double x) - { - double tmp = (x - 0.5) * Math.log(x + 4.5) - (x + 4.5); - double ser = 1.0 + 76.18009173 / (x + 0) - 86.50532033 / (x + 1) - + 24.01409822 / (x + 2) - 1.231739516 / (x + 3) - + 0.00120858003 / (x + 4) - 0.00000536382 / (x + 5); - return tmp + Math.log(ser * Math.sqrt(2 * Math.PI)); + public static double logGamma(double x) { + double tmp = (x - 0.5) * Math.log(x + 4.5) - (x + 4.5); + double ser = 1.0 + 76.18009173 / (x + 0) - 86.50532033 / (x + 1) + + 24.01409822 / (x + 2) - 1.231739516 / (x + 3) + + 0.00120858003 / (x + 4) - 0.00000536382 / (x + 5); + return tmp + Math.log(ser * Math.sqrt(2 * Math.PI)); } - public static double percentage(double x, double base) { return (base> 0 ? (x/base)*100.0 : 0); } - public static double percentage(int x, int base) { return (base> 0 ? ((double)x/(double)base)*100.0 : 0); } - public static double percentage(long x, long base) { return (base> 0 ? ((double)x/(double)base)*100.0 : 0); } + public static double percentage(double x, double base) { + return (base > 0 ? (x / base) * 100.0 : 0); + } - public static String dupString( char c, int nCopies ) { + public static double percentage(int x, int base) { + return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); + } + + public static double percentage(long x, long base) { + return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); + } + + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; - Arrays.fill(chars,c); + Arrays.fill(chars, c); return new String(chars); } public static int countOccurrences(char c, String s) { int count = 0; - for ( int i = 0; i < s.length(); i++ ) { + for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; } return count; @@ -460,17 +469,17 @@ public class Utils { public static int countOccurrences(T x, List l) { int count = 0; - for ( T y : l ) { - if ( x.equals(y) ) count++; + for (T y : l) { + if (x.equals(y)) count++; } - + return count; } public static byte listMaxByte(List quals) { - if ( quals.size() == 0 ) return 0; + if (quals.size() == 0) return 0; byte m = quals.get(0); - for ( byte b : quals ) { + for (byte b : quals) { m = b > m ? b : m; } return m; @@ -479,67 +488,84 @@ public class Utils { /** Returns indices of all occurrences of the specified symbol in the string */ public static int[] indexOfAll(String s, int ch) { - int[] pos = new int[64]; - int z = 0; - - for ( int i = 0 ; i < s.length() ; i++ ) { - if ( s.charAt(i) == ch ) pos[z++] = i; - } - return reallocate(pos,z); + int[] pos = new int[64]; + int z = 0; + + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) == ch) pos[z++] = i; + } + return reallocate(pos, z); } - - /** Returns new (reallocated) integer array of the specified size, with content + + /** + * Returns new (reallocated) integer array of the specified size, with content * of the original array orig copied into it. If newSize is * less than the size of the original array, only first newSize elements will be copied. * If new size is greater than the size of the original array, the content of the original array will be padded - * with zeros up to the new size. Finally, if new size is the same as original size, no memory reallocation - * will be performed and the original array will be returned instead. + * with zeros up to the new size. Finally, if new size is the same as original size, no memory reallocation + * will be performed and the original array will be returned instead. + * * @param orig * @param newSize + * * @return */ public static int[] reallocate(int[] orig, int newSize) { - if ( orig.length == newSize ) return orig; - int[] new_array = new int[newSize]; - int L = ( newSize > orig.length ? orig.length : newSize ); - for ( int i = 0 ; i < L ; i++ ) new_array[i] = orig[i]; - return new_array; + if (orig.length == newSize) return orig; + int[] new_array = new int[newSize]; + int L = (newSize > orig.length ? orig.length : newSize); + for (int i = 0; i < L; i++) new_array[i] = orig[i]; + return new_array; } - + /* TEST ME - public static void main(String[] argv) { - List l1 = new LinkedList(); - List l2 = new ArrayList(); + public static void main(String[] argv) { + List l1 = new LinkedList(); + List l2 = new ArrayList(); - l1.add(1); - l1.add(5); - l1.add(3); - l1.add(10); - l1.add(4); - l1.add(2); - l2.add(1); - l2.add(5); - l2.add(3); - l2.add(10); - l2.add(4); - l2.add(2); + l1.add(1); + l1.add(5); + l1.add(3); + l1.add(10); + l1.add(4); + l1.add(2); + l2.add(1); + l2.add(5); + l2.add(3); + l2.add(10); + l2.add(4); + l2.add(2); - Predicate p = new Predicate() { - public boolean apply(Integer i) { - return i > 2; - } - }; - filterInPlace(p, l1); - filterInPlace(p, l2); + Predicate p = new Predicate() { + public boolean apply(Integer i) { + return i > 2; + } + }; + filterInPlace(p, l1); + filterInPlace(p, l2); - for ( int i = 0 ; i < l1.size(); i++ ) System.out.print(" "+l1.get(i)); - System.out.println(); - for ( int i = 0 ; i < l2.size(); i++ ) System.out.print(" " + l2.get(i)); - System.out.println(); + for ( int i = 0 ; i < l1.size(); i++ ) System.out.print(" "+l1.get(i)); + System.out.println(); + for ( int i = 0 ; i < l2.size(); i++ ) System.out.print(" " + l2.get(i)); + System.out.println(); + } + + */ + + /** + * a helper method. Turns a single character string into a char. + * + * @param str the string + * + * @return a char + */ + public static char stringToChar(String str) { + if (str.length() != 1) throw new IllegalArgumentException("String length must be one"); + return str.charAt(0); } -*/ + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java b/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java index e4863cf6a..2082d127a 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/BasicGenotype.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; +import java.util.List; + /** * @author aaron @@ -153,4 +155,17 @@ public class BasicGenotype implements Genotype { if (!isVariant(this.mRef)) throw new IllegalStateException("this genotype is not a variant"); return new BasicVariation(this.getBases(), String.valueOf(mRef), this.getBases().length(), mLocation, mNegLog10PError); } + + /** + * Turn a list of alleles into a genotype + * @param alleles the list of alleles + * @return a string representation of this list + */ + public static String alleleListToString(List alleles) { + StringBuilder builder = new StringBuilder(); + for (String allele : alleles) + builder.append(allele); + return builder.toString(); + } + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java b/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java index 361c617fc..87535f52b 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/BasicVariation.java @@ -1,9 +1,11 @@ package org.broadinstitute.sting.utils.genotype; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; -import java.util.List; import java.util.ArrayList; +import java.util.List; /** * User: aaron @@ -40,6 +42,7 @@ public class BasicVariation implements Variation { public BasicVariation(String bases, String reference, int length, GenomeLoc location, double confidence) { mBases = bases; mRef = reference; + if (mRef.length() != 1) throw new StingException("The reference must be a single base"); mLength = length; mLocation = location; mConfidence = confidence; @@ -82,23 +85,6 @@ public class BasicVariation implements Variation { return (mLength < 0); } - @Override - public String getAlternateBases() { - return mBases; - } - - /** - * gets the alternate bases. Use this method if teh allele count is greater then 2 - * - * @return - */ - @Override - public List getAlternateBaseList() { - List list = new ArrayList(); - list.add(this.getAlternateBases()); - return list; - } - @Override public GenomeLoc getLocation() { return mLocation; @@ -112,7 +98,7 @@ public class BasicVariation implements Variation { /** are we bi-allelic? */ @Override public boolean isBiallelic() { - return true; + return (getAlternateAlleleList().size() == 1); } @Override @@ -120,6 +106,40 @@ public class BasicVariation implements Variation { return mConfidence; } + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + @Override + public List getAlternateAlleleList() { + List list = new ArrayList(); + for (char c : this.mBases.toCharArray()) + if (c != Utils.stringToChar(mRef)) + list.add(String.valueOf(c)); + return list; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + @Override + public List getAlleleList() { + List list = new ArrayList(); + if (this.mBases.contains(mRef)) list.add(mRef); + for (char c : this.mBases.toCharArray()) + if (c != Utils.stringToChar(mRef)) + list.add(String.valueOf(c)); + return list; + } + @Override public boolean isReference() { if (mLength != 0) return false; @@ -149,11 +169,8 @@ public class BasicVariation implements Variation { @Override public char getAlternativeBaseForSNP() { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - - // we know that if we're a snp, the reference is a single base, so charAt(0) is safe - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(1); - return getAlternateBases().charAt(0); + if (!this.isBiallelic() || this.getAlternateAlleleList().size() != 1) throw new IllegalStateException("we're not biallelic"); + return Utils.stringToChar(this.getAlternateAlleleList().get(0)); } /** @@ -164,11 +181,8 @@ public class BasicVariation implements Variation { @Override public char getReferenceForSNP() { if (!this.isSNP()) throw new IllegalStateException("we're not a SNP"); - - // we know that if we're a snp, the reference is a single base, so charAt(0) is safe - if (getAlternateBases().charAt(0) == this.getReference().charAt(0)) - return getAlternateBases().charAt(0); - return getAlternateBases().charAt(1); + if (!this.isBiallelic()) throw new IllegalStateException("we're not biallelic"); + return Utils.stringToChar(this.mRef); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java index bce1b1c72..c7c8c21dd 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Variation.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Variation.java @@ -14,14 +14,21 @@ import java.util.List; public interface Variation { // the types of variants we currently allow public enum VARIANT_TYPE { - SNP, INDEL, REFERENCE // though reference is not really a variant + SNP, INDEL, REFERENCE // though reference is not really a variant, we need to represent it } + /** are we bi-allelic? */ + public boolean isBiallelic(); + /** * get the frequency of this variant, if we're a variant. If we're reference this method - * should return 0. + * should return 0. If we can't provide an alternate allele frequency, this should also + * return 0. * - * @return double with the stored frequency + * WARNING: This method is only valid for biAllelic data, the contract is to check isBiallelic() + * before calling this method + * + * @return double the minor allele frequency */ public double getNonRefAlleleFrequency(); @@ -32,7 +39,8 @@ public interface Variation { public VARIANT_TYPE getType(); /** - * are we a SNP? If not we're a Indel/deletion or the reference + * are we a SNP? If not we're a Indel/deletion or the reference. This method must be call before you use + * the convenience methods getAlternativeBaseForSNP or getReferenceForSNP, to ensure that you're working with a SNP * * @return true if we're a SNP */ @@ -60,22 +68,26 @@ public interface Variation { public boolean isReference(); /** - * get the location that this Variant represents + * are we an insertion or a deletion? yes, then return true. No? false. + * + * @return true if we're an insertion or deletion + */ + public boolean isIndel(); + + /** + * get the location of this Variant * * @return a GenomeLoc */ public GenomeLoc getLocation(); /** - * get the reference base(s) at this position + * get the reference base(s) for this Variant * * @return the reference base or bases, as a string */ public String getReference(); - /** are we bi-allelic? */ - public boolean isBiallelic(); - /** * get the -1 * (log 10 of the error value) * @@ -83,26 +95,26 @@ public interface Variation { */ public double getNegLog10PError(); - /** - * gets the alternate base. Use this method if we're biallelic - * - * @return - */ - public String getAlternateBases(); /** - * gets the alternate bases. Use this method if the allele count is greater then 2 (not biallelic) + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). * - * @return + * @return an alternate allele list */ - public List getAlternateBaseList(); + public List getAlternateAlleleList(); /** - * are we an insertion or a deletion? yes, then return true. No? false. + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. If the reference base is not an allele in this varation + * it will not be in the list (i.e. there is no guarantee that the reference base is in the list). * - * @return true if we're an insertion or deletion + * @return an alternate allele list */ - public boolean isIndel(); + public List getAlleleList(); /** * gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/rodDbSNPTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/rodDbSNPTest.java index 9f2bc09ff..a179ad2f6 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/rodDbSNPTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/rodDbSNPTest.java @@ -56,10 +56,10 @@ public class rodDbSNPTest extends BaseTest { rodDbSNP var = (rodDbSNP)rod; if (rod.isSNP()) { // quick check, if we're not triallelic, make sure the ref is right - if (var.getRefSnpFWD() == var.refBases.charAt(0) || var.getAltSnpFWD() == var.refBases.charAt(0)) + if (var.getReferenceForSNP() == var.refBases.charAt(0) || var.getAlternativeBaseForSNP() == var.refBases.charAt(0)) // also make sure the ref is a single character if (var.refBases.length() == 1) - Assert.assertTrue(var.refBases.charAt(0)==var.getRefSnpFWD()); + Assert.assertTrue(var.refBases.charAt(0)==var.getReferenceForSNP()); if (var.getLocation().getContig().equals("1") && var.getLocation().getStart() >= 10000000 && var.getLocation().getStart() <= 11000000) { diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalkerIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalkerIntegrationTest.java index e064b05f2..b9c7c2141 100644 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalkerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalkerIntegrationTest.java @@ -82,7 +82,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest { @Test public void testEvalVariantRODOutputViolations() { List md5 = new ArrayList(); - md5.add("ad2ca71dfa7e45f369380178c4f8e69f"); + md5.add("d84e5b2a23ab1cf028145f09cd1e9f5b"); /** * the above MD5 was calculated from running the following command: