diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java index 9cac5d536..6f2616c69 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java @@ -28,15 +28,18 @@ package org.broadinstitute.gatk.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import com.google.java.contract.ThrowEnsures; -import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; +import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.tribble.Feature; +import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * Factory class for creating GenomeLocs */ @@ -105,6 +108,7 @@ public final class GenomeLocParser { this(seqDict, ValidationLevel.STANDARD); } + static public String AMBIGUOUS_CONTIG_WARNINIG = "Ambiguous contig prefix!"; /** * Create a genome loc parser based on seqDict with the specified level of validation * @param seqDict the sequence dictionary to use when creating genome locs @@ -126,6 +130,24 @@ public final class GenomeLocParser { logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); } } + + final Pattern numericEnding = Pattern.compile(".*:[0-9][0-9,]*(\\+|-[0-9][0-9,]*)?$"); + for(final SAMSequenceRecord contig1: this.getContigs().getSequences()) { + final String name1 = contig1.getSequenceName(); + for (final SAMSequenceRecord contig2 : this.getContigs().getSequences()) { + if (contig1 == contig2) continue; + final String name2 = contig2.getSequenceName(); + + final Matcher numericMatcher = numericEnding.matcher(name1); + if (name1.startsWith(name2) && numericMatcher.matches()) { + logger.warn(AMBIGUOUS_CONTIG_WARNINIG); + logger.warn("Contig name" + name1 + " has a prefix that is also the name of another contig (" + + name1 + + "), and has an ending that looks like a genomic position. The contigs in this reference may not parse reliably." + + " In addition, the existence of a colon (:) in the contig name is not compliant with the VCF specifications (1.4.1.1)."); + } + } + } } /** @@ -350,7 +372,7 @@ public final class GenomeLocParser { @Requires("str != null") @Ensures("result != null") public GenomeLoc parseGenomeLoc(final String str) { - // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' + // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' or 'chr2:500+' //System.out.printf("Parsing location '%s'%n", str); String contig = null; @@ -358,15 +380,15 @@ public final class GenomeLocParser { int stop = -1; final int colonIndex = str.lastIndexOf(":"); - if(colonIndex == -1) { - contig = str.substring(0, str.length()); // chr1 + if (colonIndex == -1 || contigIsInDictionary(str) ) { + contig = str; // chr1 stop = Integer.MAX_VALUE; } else { contig = str.substring(0, colonIndex); final int dashIndex = str.indexOf('-', colonIndex); try { - if(dashIndex == -1) { - if(str.charAt(str.length() - 1) == '+') { + if (dashIndex == -1) { + if (str.charAt(str.length() - 1) == '+') { start = parsePosition(str.substring(colonIndex + 1, str.length() - 1)); // chr:1+ stop = Integer.MAX_VALUE; } else { @@ -377,7 +399,7 @@ public final class GenomeLocParser { start = parsePosition(str.substring(colonIndex + 1, dashIndex)); // chr1:1-1 stop = parsePosition(str.substring(dashIndex + 1)); } - } catch(Exception e) { + } catch (Exception e) { throw new UserException("Failed to parse Genome Location string: " + str, e); } } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java index 2d192d32d..29a051b36 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java @@ -26,10 +26,10 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; @@ -108,13 +108,39 @@ public class ArtificialSAMUtils { // make up some sequence records for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); - rec.setSequenceLength(chromosomeSize); dict.addSequence(rec); } header.setSequenceDictionary(dict); return header; } + + /** + * Creates an artificial sam header, matching the parameters, + * + * @param contigNames List of names for contigs + * @param contigLengths List of lengths for contigs. the two lists must be of the same size. + * @return a SAMFileHeader with the references with the inputed names and lengths. + **/ + public static SAMFileHeader createArtificialSamHeader(List contigNames, List contigLengths) { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate); + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + + // make up some sequence records + if (contigLengths.size() != contigNames.size()) { + throw new ReviewedGATKException("Passed in contigLengths is different from contigNames"); + } + + for (int i = 0; i < contigLengths.size(); i++) { + SAMSequenceRecord rec = new SAMSequenceRecord(contigNames.get(i), contigLengths.get(i)); + dict.addSequence(rec); + } + header.setSequenceDictionary(dict); + return header; + } + + /** * Creates an artificial sam header based on the sequence dictionary dict * diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java index 2f49bbd60..c02bae7fd 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java @@ -29,17 +29,17 @@ package org.broadinstitute.gatk.utils; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.CollectionUtil; import htsjdk.tribble.Feature; import htsjdk.tribble.SimpleFeature; -import org.broadinstitute.gatk.utils.BaseTest; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -47,9 +47,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; +import java.util.*; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; @@ -65,12 +63,88 @@ public class GenomeLocParserUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; private SAMFileHeader header; + private GenomeLocParser bizareGenomeLocParser; + private SAMFileHeader bizareHeader; + @BeforeClass - public void init() { + public void init() { header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + //Crazy contig names that are all allowed by the SAM-Spec + bizareHeader = ArtificialSAMUtils.createArtificialSamHeader( + CollectionUtil.makeList("LooksLikeHasPos:01",//1 + "LooksLikeARange:01-02",//2 + "LooksLikeWhole:1+",//3 + "\'Quoted\'",//4 + "***Stars***",//5 + "TwoColons:01:02",//6 + "ConfusingElements:1-2",//7 + "ConfusingElements:1",//8 + "ConfusingElements:1+"//9 + // , "ConfusingElements" //10 ... This element causes trouble + ), + CollectionUtil.makeList(20, 20, 20, 20, 20, 20, 20, 20, 20 + // ,20 // ... not sure how to deal with it. + )); + bizareGenomeLocParser = new GenomeLocParser(bizareHeader.getSequenceDictionary()); } + + @DataProvider() + public Object[][] TestConfusingContigNamesData(){ + List tests=new ArrayList<>(); + + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1-")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1,0-200","ConfusingElements:1,0")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-3","ConfusingElements")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1","ConfusingElements")}); + tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1+","ConfusingElements")}); + + return tests.toArray(new Object[tests.size()][]); + } + + @Test(dataProvider = "TestConfusingContigNamesData" ) + void TestConfusingContigNames(final List contigNames){ + + ValidationAppender checkWarning = new ValidationAppender(GenomeLocParser.AMBIGUOUS_CONTIG_WARNINIG); + + final SAMFileHeader confusingHeader = ArtificialSAMUtils.createArtificialSamHeader( + contigNames, Collections.nCopies(contigNames.size(), 20)); + + logger.addAppender(checkWarning); + new GenomeLocParser(confusingHeader.getSequenceDictionary()); + + Assert.assertTrue(checkWarning.foundString(),"Expected warning about prefixes"); + } + + + @DataProvider + public Object[][] testBizzarreContigNamesData() { + List tests = new ArrayList<>(); + + for (final SAMSequenceRecord sequence : bizareGenomeLocParser.getContigs().getSequences()) { + String name = sequence.getSequenceName(); + tests.add(new Object[]{name, new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())}); + tests.add(new Object[]{name + ":1", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 1)}); + tests.add(new Object[]{name + ":1+", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())}); + tests.add(new Object[]{name + ":1-10", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 10)}); + } + + return tests.toArray(new Object[tests.size()][]); + } + + @Test(dataProvider = "testBizzarreContigNamesData") + public void testBizzarreContigNames(String inputString, GenomeLoc expectedResult) { + + Assert.assertEquals(bizareGenomeLocParser.parseGenomeLoc(inputString), expectedResult); + } + + @Test(expectedExceptions=UserException.MalformedGenomeLoc.class) public void testGetContigIndex() { assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference