From 392b5cbcdfd5200f04d0b26f9e73d16399e17769 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 10:22:43 -0500 Subject: [PATCH] The CachingIndexedFastaSequenceFile now automatically converts IUPAC bases to Ns and errors out on other non-standard bases. This way walkers won't see anything except the standard bases plus Ns in the reference. Added option to turn off this feature (to maintain backwards compatibility). As part of this commit I cleaned up the BaseUtils code by adding a Base enum and removing all of the static indexes for each of the bases. This uncovered a bug in the way the DepthOfCoverage walker counts deletions (it was counting Ns instead!) that isn't covered by tests. Fortunately that walker is being deprecated soon... --- .../gatk/walkers/annotator/GCContent.java | 4 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../coverage/DepthOfCoverageStats.java | 2 +- .../validation/ValidationAmplicons.java | 2 +- .../ConcordanceMetricsUnitTest.java | 48 ++++----- .../gatk/walkers/coverage/CoverageUtils.java | 6 +- .../walkers/coverage/GCContentByInterval.java | 2 +- .../CachingIndexedFastaSequenceFile.java | 44 +++++--- .../sting/utils/pileup/PileupElement.java | 3 +- .../sting/utils/sam/AlignmentUtils.java | 9 +- .../variant/utils/BaseUtils.java | 102 ++++++++++++------ .../variant/utils/BaseUtilsUnitTest.java | 15 +++ .../GenotypeLikelihoodsUnitTest.java | 6 +- 13 files changed, 153 insertions(+), 92 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 3bb3d7d5a..2b3290595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -95,9 +95,9 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota for ( byte base : ref.getBases() ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() ) gc++; - else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() ) at++; else ; // ignore diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 1e4c55e0d..b10daab58 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -938,7 +938,7 @@ public class DepthOfCoverage extends LocusWalker { if ( lowerCaseSNPs ) { sequence.append(Character.toLowerCase((char) ref.getBase())); } else { - sequence.append((char) BaseUtils.N); + sequence.append((char) BaseUtils.Base.N.base); } rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 28f128dd3..6db44efd5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -111,8 +111,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData1() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); @@ -160,9 +160,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData2() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); @@ -213,10 +213,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData3() { - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C,BaseUtils.T},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C}); - Allele alt_A = Allele.create(BaseUtils.A); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.T,BaseUtils.T}); + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); @@ -267,9 +267,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData4() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); @@ -316,9 +316,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData5() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); @@ -368,8 +368,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private List> getData6() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); // site 1 - @@ -396,8 +396,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - reference_A = Allele.create(BaseUtils.A,true); - Allele alt_T = Allele.create(BaseUtils.T); + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); // site 2 - // sample 1: no-call/hom-ref @@ -421,7 +421,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - Allele alt_G = Allele.create(BaseUtils.G); + Allele alt_G = Allele.create(BaseUtils.Base.G.base); // site 3 - // sample 1: alleles do not match @@ -605,10 +605,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { public List> getData7() { - Allele ref1 = Allele.create(BaseUtils.T,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.G); - Allele alt3 = Allele.create(BaseUtils.A); + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); VariantContextBuilder site1Eval = new VariantContextBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 573291d06..fe2eee2a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,9 +217,9 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); } else { try { counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 9cd1be2d9..668d3fd5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -86,7 +86,7 @@ public class GCContentByInterval extends LocusWalker { if (tracker == null) return null; int baseIndex = ref.getBaseIndex(); - return (baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex) ? 1L : 0L; + return (baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal()) ? 1L : 0L; } public Long reduce(Long toAdd, Long runningCount) { diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 3d43d5d4d..88eaa8910 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,6 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -41,9 +42,10 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a thread-local cache + * Thread-safe! Uses a thread-local cache. * - * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -64,10 +66,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private final long cacheMissBackup; /** - * If true, we will preserve the case of the original base in the genome, not + * If true, we will preserve the case of the original base in the genome */ private final boolean preserveCase; + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; @@ -97,13 +104,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param index the index of the fasta file, used for efficient random access * @param cacheSize the size in bp of the cache we will use for this reader * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } /** @@ -122,19 +131,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + preserveIUPAC = false; } -// /** -// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. -// * -// * @param fasta The file to open. -// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. -// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. -// */ -// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { -// this(fasta, index, DEFAULT_CACHE_SIZE); -// } - /** * Same as general constructor but allows one to override the default cacheSize * @@ -145,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size in bp of the cache we will use for this reader */ public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false); + this(fasta, index, cacheSize, false, false); } /** @@ -240,6 +239,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return ! isPreservingCase(); } + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? + * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -261,8 +269,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); } else { - // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); if (stop > contigInfo.getSequenceLength()) @@ -276,6 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index c0e18f227..5a5358208 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -52,7 +51,7 @@ public class PileupElement implements Comparable { private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - public static final byte DELETION_BASE = BaseUtils.D; + public static final byte DELETION_BASE = BaseUtils.Base.D.base; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 0907a0239..b7a813ec2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -402,13 +401,13 @@ public class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.A) { + if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.C) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.T) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.G) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 819041a3e..7a37e8de5 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -34,42 +35,66 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte) 'A'; - public final static byte C = (byte) 'C'; - public final static byte G = (byte) 'G'; - public final static byte T = (byte) 'T'; - public final static byte N = (byte) 'N'; - public final static byte D = (byte) 'D'; + public enum Base { + A ((byte)'A'), + C ((byte)'C'), + G ((byte)'G'), + T ((byte)'T'), + N ((byte)'N'), + D ((byte)'D'); - // - // todo -- we need a generalized base abstraction using the Base enum. - // + public byte base; + + private Base(final byte base) { + this.base = base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = 0; - baseIndexMap['a'] = 0; - baseIndexMap['*'] = 0; // the wildcard character counts as an A - baseIndexMap['C'] = 1; - baseIndexMap['c'] = 1; - baseIndexMap['G'] = 2; - baseIndexMap['g'] = 2; - baseIndexMap['T'] = 3; - baseIndexMap['t'] = 3; + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); } - // todo -- fix me (enums?) - public static final byte DELETION_INDEX = 4; - public static final byte NO_CALL_INDEX = 5; // (this is 'N') - - public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); - public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); - public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); - public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } // Use a fixed random seed to allow for deterministic results when using random bases private static final Random randomNumberGen = new Random(47382911L); @@ -96,10 +121,10 @@ public class BaseUtils { } public static boolean isTransition(byte base1, byte base2) { - int b1 = simpleBaseToBaseIndex(base1); - int b2 = simpleBaseToBaseIndex(base2); - return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); } public static boolean isTransversion(byte base1, byte base2) { @@ -141,6 +166,19 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + final int length = bases.length; + for ( int i = 0; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -231,10 +269,10 @@ public class BaseUtils { switch (base) { case 'd': case 'D': - return DELETION_INDEX; + return Base.D.ordinal(); case 'n': case 'N': - return NO_CALL_INDEX; + return Base.N.ordinal(); default: return simpleBaseToBaseIndex(base); diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 372d13a7a..4f918f718 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -50,6 +50,21 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); } + @Test + public void testConvertIUPACtoN() { + + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + } + + private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { + for ( int i = 0; i < b1.length; i++ ) + Assert.assertEquals(b1[i], b2[i]); + } + @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion"); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index 49720d1f6..03d6f457f 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -154,9 +154,9 @@ public class GenotypeLikelihoodsUnitTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.A,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.T); + Allele ref = Allele.create(BaseUtils.Base.A.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.T.base); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder();