- Fixed bug in GenomeLoc parser

- Added a warning when two contigs are too similar that it might cause problems with parsing
- Added tests of modified parser and of warning.
This commit is contained in:
Yossi Farjoun 2016-01-16 22:37:12 -05:00
parent 8ab2eef6f0
commit 7896055be3
3 changed files with 140 additions and 18 deletions

View File

@ -28,15 +28,18 @@ package org.broadinstitute.gatk.utils;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import com.google.java.contract.ThrowEnsures; import com.google.java.contract.ThrowEnsures;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger; import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature; import htsjdk.tribble.Feature;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* Factory class for creating GenomeLocs * Factory class for creating GenomeLocs
*/ */
@ -105,6 +108,7 @@ public final class GenomeLocParser {
this(seqDict, ValidationLevel.STANDARD); this(seqDict, ValidationLevel.STANDARD);
} }
static public String AMBIGUOUS_CONTIG_WARNINIG = "Ambiguous contig prefix!";
/** /**
* Create a genome loc parser based on seqDict with the specified level of validation * Create a genome loc parser based on seqDict with the specified level of validation
* @param seqDict the sequence dictionary to use when creating genome locs * @param seqDict the sequence dictionary to use when creating genome locs
@ -126,6 +130,24 @@ public final class GenomeLocParser {
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
} }
} }
final Pattern numericEnding = Pattern.compile(".*:[0-9][0-9,]*(\\+|-[0-9][0-9,]*)?$");
for(final SAMSequenceRecord contig1: this.getContigs().getSequences()) {
final String name1 = contig1.getSequenceName();
for (final SAMSequenceRecord contig2 : this.getContigs().getSequences()) {
if (contig1 == contig2) continue;
final String name2 = contig2.getSequenceName();
final Matcher numericMatcher = numericEnding.matcher(name1);
if (name1.startsWith(name2) && numericMatcher.matches()) {
logger.warn(AMBIGUOUS_CONTIG_WARNINIG);
logger.warn("Contig name" + name1 + " has a prefix that is also the name of another contig (" +
name1 +
"), and has an ending that looks like a genomic position. The contigs in this reference may not parse reliably." +
" In addition, the existence of a colon (:) in the contig name is not compliant with the VCF specifications (1.4.1.1).");
}
}
}
} }
/** /**
@ -350,7 +372,7 @@ public final class GenomeLocParser {
@Requires("str != null") @Requires("str != null")
@Ensures("result != null") @Ensures("result != null")
public GenomeLoc parseGenomeLoc(final String str) { public GenomeLoc parseGenomeLoc(final String str) {
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' or 'chr2:500+'
//System.out.printf("Parsing location '%s'%n", str); //System.out.printf("Parsing location '%s'%n", str);
String contig = null; String contig = null;
@ -358,15 +380,15 @@ public final class GenomeLocParser {
int stop = -1; int stop = -1;
final int colonIndex = str.lastIndexOf(":"); final int colonIndex = str.lastIndexOf(":");
if(colonIndex == -1) { if (colonIndex == -1 || contigIsInDictionary(str) ) {
contig = str.substring(0, str.length()); // chr1 contig = str; // chr1
stop = Integer.MAX_VALUE; stop = Integer.MAX_VALUE;
} else { } else {
contig = str.substring(0, colonIndex); contig = str.substring(0, colonIndex);
final int dashIndex = str.indexOf('-', colonIndex); final int dashIndex = str.indexOf('-', colonIndex);
try { try {
if(dashIndex == -1) { if (dashIndex == -1) {
if(str.charAt(str.length() - 1) == '+') { if (str.charAt(str.length() - 1) == '+') {
start = parsePosition(str.substring(colonIndex + 1, str.length() - 1)); // chr:1+ start = parsePosition(str.substring(colonIndex + 1, str.length() - 1)); // chr:1+
stop = Integer.MAX_VALUE; stop = Integer.MAX_VALUE;
} else { } else {
@ -377,7 +399,7 @@ public final class GenomeLocParser {
start = parsePosition(str.substring(colonIndex + 1, dashIndex)); // chr1:1-1 start = parsePosition(str.substring(colonIndex + 1, dashIndex)); // chr1:1-1
stop = parsePosition(str.substring(dashIndex + 1)); stop = parsePosition(str.substring(dashIndex + 1));
} }
} catch(Exception e) { } catch (Exception e) {
throw new UserException("Failed to parse Genome Location string: " + str, e); throw new UserException("Failed to parse Genome Location string: " + str, e);
} }
} }

View File

@ -26,10 +26,10 @@
package org.broadinstitute.gatk.utils.sam; package org.broadinstitute.gatk.utils.sam;
import htsjdk.samtools.*; import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
@ -108,13 +108,39 @@ public class ArtificialSAMUtils {
// make up some sequence records // make up some sequence records
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
rec.setSequenceLength(chromosomeSize);
dict.addSequence(rec); dict.addSequence(rec);
} }
header.setSequenceDictionary(dict); header.setSequenceDictionary(dict);
return header; return header;
} }
/**
* Creates an artificial sam header, matching the parameters,
*
* @param contigNames List of names for contigs
* @param contigLengths List of lengths for contigs. the two lists must be of the same size.
* @return a SAMFileHeader with the references with the inputed names and lengths.
**/
public static SAMFileHeader createArtificialSamHeader(List<String> contigNames, List<Integer> contigLengths) {
SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
SAMSequenceDictionary dict = new SAMSequenceDictionary();
// make up some sequence records
if (contigLengths.size() != contigNames.size()) {
throw new ReviewedGATKException("Passed in contigLengths is different from contigNames");
}
for (int i = 0; i < contigLengths.size(); i++) {
SAMSequenceRecord rec = new SAMSequenceRecord(contigNames.get(i), contigLengths.get(i));
dict.addSequence(rec);
}
header.setSequenceDictionary(dict);
return header;
}
/** /**
* Creates an artificial sam header based on the sequence dictionary dict * Creates an artificial sam header based on the sequence dictionary dict
* *

View File

@ -29,17 +29,17 @@ package org.broadinstitute.gatk.utils;
import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.tribble.Feature; import htsjdk.tribble.Feature;
import htsjdk.tribble.SimpleFeature; import htsjdk.tribble.SimpleFeature;
import org.broadinstitute.gatk.utils.BaseTest; import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
@ -47,9 +47,7 @@ import org.testng.annotations.Test;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.util.Arrays; import java.util.*;
import java.util.LinkedList;
import java.util.List;
import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertTrue;
@ -65,12 +63,88 @@ public class GenomeLocParserUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser; private GenomeLocParser genomeLocParser;
private SAMFileHeader header; private SAMFileHeader header;
private GenomeLocParser bizareGenomeLocParser;
private SAMFileHeader bizareHeader;
@BeforeClass @BeforeClass
public void init() { public void init() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
//Crazy contig names that are all allowed by the SAM-Spec
bizareHeader = ArtificialSAMUtils.createArtificialSamHeader(
CollectionUtil.makeList("LooksLikeHasPos:01",//1
"LooksLikeARange:01-02",//2
"LooksLikeWhole:1+",//3
"\'Quoted\'",//4
"***Stars***",//5
"TwoColons:01:02",//6
"ConfusingElements:1-2",//7
"ConfusingElements:1",//8
"ConfusingElements:1+"//9
// , "ConfusingElements" //10 ... This element causes trouble
),
CollectionUtil.makeList(20, 20, 20, 20, 20, 20, 20, 20, 20
// ,20 // ... not sure how to deal with it.
));
bizareGenomeLocParser = new GenomeLocParser(bizareHeader.getSequenceDictionary());
} }
@DataProvider()
public Object[][] TestConfusingContigNamesData(){
List<Object[]> tests=new ArrayList<>();
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1-")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1,0-200","ConfusingElements:1,0")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-3","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1+","ConfusingElements")});
return tests.toArray(new Object[tests.size()][]);
}
@Test(dataProvider = "TestConfusingContigNamesData" )
void TestConfusingContigNames(final List<String> contigNames){
ValidationAppender checkWarning = new ValidationAppender(GenomeLocParser.AMBIGUOUS_CONTIG_WARNINIG);
final SAMFileHeader confusingHeader = ArtificialSAMUtils.createArtificialSamHeader(
contigNames, Collections.nCopies(contigNames.size(), 20));
logger.addAppender(checkWarning);
new GenomeLocParser(confusingHeader.getSequenceDictionary());
Assert.assertTrue(checkWarning.foundString(),"Expected warning about prefixes");
}
@DataProvider
public Object[][] testBizzarreContigNamesData() {
List<Object[]> tests = new ArrayList<>();
for (final SAMSequenceRecord sequence : bizareGenomeLocParser.getContigs().getSequences()) {
String name = sequence.getSequenceName();
tests.add(new Object[]{name, new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
tests.add(new Object[]{name + ":1", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 1)});
tests.add(new Object[]{name + ":1+", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
tests.add(new Object[]{name + ":1-10", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 10)});
}
return tests.toArray(new Object[tests.size()][]);
}
@Test(dataProvider = "testBizzarreContigNamesData")
public void testBizzarreContigNames(String inputString, GenomeLoc expectedResult) {
Assert.assertEquals(bizareGenomeLocParser.parseGenomeLoc(inputString), expectedResult);
}
@Test(expectedExceptions=UserException.MalformedGenomeLoc.class) @Test(expectedExceptions=UserException.MalformedGenomeLoc.class)
public void testGetContigIndex() { public void testGetContigIndex() {
assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference