diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 3bb3d7d5a..2b3290595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -95,9 +95,9 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota for ( byte base : ref.getBases() ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() ) gc++; - else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() ) at++; else ; // ignore diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 1e4c55e0d..b10daab58 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -938,7 +938,7 @@ public class DepthOfCoverage extends LocusWalker { if ( lowerCaseSNPs ) { sequence.append(Character.toLowerCase((char) ref.getBase())); } else { - sequence.append((char) BaseUtils.N); + sequence.append((char) BaseUtils.Base.N.base); } rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 28f128dd3..6db44efd5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -111,8 +111,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData1() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); @@ -160,9 +160,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData2() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); @@ -213,10 +213,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData3() { - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C,BaseUtils.T},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C}); - Allele alt_A = Allele.create(BaseUtils.A); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.T,BaseUtils.T}); + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); @@ -267,9 +267,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData4() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); @@ -316,9 +316,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData5() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); @@ -368,8 +368,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private List> getData6() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); // site 1 - @@ -396,8 +396,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - reference_A = Allele.create(BaseUtils.A,true); - Allele alt_T = Allele.create(BaseUtils.T); + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); // site 2 - // sample 1: no-call/hom-ref @@ -421,7 +421,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - Allele alt_G = Allele.create(BaseUtils.G); + Allele alt_G = Allele.create(BaseUtils.Base.G.base); // site 3 - // sample 1: alleles do not match @@ -605,10 +605,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { public List> getData7() { - Allele ref1 = Allele.create(BaseUtils.T,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.G); - Allele alt3 = Allele.create(BaseUtils.A); + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); VariantContextBuilder site1Eval = new VariantContextBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 573291d06..fe2eee2a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,9 +217,9 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); } else { try { counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 9cd1be2d9..668d3fd5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -86,7 +86,7 @@ public class GCContentByInterval extends LocusWalker { if (tracker == null) return null; int baseIndex = ref.getBaseIndex(); - return (baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex) ? 1L : 0L; + return (baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal()) ? 1L : 0L; } public Long reduce(Long toAdd, Long runningCount) { diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 3d43d5d4d..88eaa8910 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,6 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -41,9 +42,10 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a thread-local cache + * Thread-safe! Uses a thread-local cache. * - * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -64,10 +66,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private final long cacheMissBackup; /** - * If true, we will preserve the case of the original base in the genome, not + * If true, we will preserve the case of the original base in the genome */ private final boolean preserveCase; + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; @@ -97,13 +104,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param index the index of the fasta file, used for efficient random access * @param cacheSize the size in bp of the cache we will use for this reader * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } /** @@ -122,19 +131,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + preserveIUPAC = false; } -// /** -// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. -// * -// * @param fasta The file to open. -// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. -// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. -// */ -// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { -// this(fasta, index, DEFAULT_CACHE_SIZE); -// } - /** * Same as general constructor but allows one to override the default cacheSize * @@ -145,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size in bp of the cache we will use for this reader */ public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false); + this(fasta, index, cacheSize, false, false); } /** @@ -240,6 +239,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return ! isPreservingCase(); } + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? + * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -261,8 +269,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); } else { - // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); if (stop > contigInfo.getSequenceLength()) @@ -276,6 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index c0e18f227..5a5358208 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -52,7 +51,7 @@ public class PileupElement implements Comparable { private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - public static final byte DELETION_BASE = BaseUtils.D; + public static final byte DELETION_BASE = BaseUtils.Base.D.base; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 0907a0239..b7a813ec2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -402,13 +401,13 @@ public class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.A) { + if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.C) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.T) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.G) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 819041a3e..7a37e8de5 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -34,42 +35,66 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte) 'A'; - public final static byte C = (byte) 'C'; - public final static byte G = (byte) 'G'; - public final static byte T = (byte) 'T'; - public final static byte N = (byte) 'N'; - public final static byte D = (byte) 'D'; + public enum Base { + A ((byte)'A'), + C ((byte)'C'), + G ((byte)'G'), + T ((byte)'T'), + N ((byte)'N'), + D ((byte)'D'); - // - // todo -- we need a generalized base abstraction using the Base enum. - // + public byte base; + + private Base(final byte base) { + this.base = base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = 0; - baseIndexMap['a'] = 0; - baseIndexMap['*'] = 0; // the wildcard character counts as an A - baseIndexMap['C'] = 1; - baseIndexMap['c'] = 1; - baseIndexMap['G'] = 2; - baseIndexMap['g'] = 2; - baseIndexMap['T'] = 3; - baseIndexMap['t'] = 3; + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); } - // todo -- fix me (enums?) - public static final byte DELETION_INDEX = 4; - public static final byte NO_CALL_INDEX = 5; // (this is 'N') - - public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); - public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); - public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); - public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } // Use a fixed random seed to allow for deterministic results when using random bases private static final Random randomNumberGen = new Random(47382911L); @@ -96,10 +121,10 @@ public class BaseUtils { } public static boolean isTransition(byte base1, byte base2) { - int b1 = simpleBaseToBaseIndex(base1); - int b2 = simpleBaseToBaseIndex(base2); - return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); } public static boolean isTransversion(byte base1, byte base2) { @@ -141,6 +166,19 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + final int length = bases.length; + for ( int i = 0; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -231,10 +269,10 @@ public class BaseUtils { switch (base) { case 'd': case 'D': - return DELETION_INDEX; + return Base.D.ordinal(); case 'n': case 'N': - return NO_CALL_INDEX; + return Base.N.ordinal(); default: return simpleBaseToBaseIndex(base); diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 372d13a7a..4f918f718 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -50,6 +50,21 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); } + @Test + public void testConvertIUPACtoN() { + + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + } + + private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { + for ( int i = 0; i < b1.length; i++ ) + Assert.assertEquals(b1[i], b2[i]); + } + @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion"); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index 49720d1f6..03d6f457f 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -154,9 +154,9 @@ public class GenotypeLikelihoodsUnitTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.A,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.T); + Allele ref = Allele.create(BaseUtils.Base.A.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.T.base); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder();