Merge pull request #1273 from broadinstitute/yf_Fix_GATK_bug_around_hg38_contignames
Various fixes surrounding GenomeLoc parser and hg38 contig names
This commit is contained in:
commit
3b327ac0e4
|
|
@ -28,15 +28,18 @@ package org.broadinstitute.gatk.utils;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import com.google.java.contract.ThrowEnsures;
|
||||
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
||||
import htsjdk.samtools.SAMRecord;
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
||||
import htsjdk.tribble.Feature;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Factory class for creating GenomeLocs
|
||||
*/
|
||||
|
|
@ -105,6 +108,7 @@ public final class GenomeLocParser {
|
|||
this(seqDict, ValidationLevel.STANDARD);
|
||||
}
|
||||
|
||||
static public String AMBIGUOUS_CONTIG_WARNINIG = "Ambiguous contig prefix!";
|
||||
/**
|
||||
* Create a genome loc parser based on seqDict with the specified level of validation
|
||||
* @param seqDict the sequence dictionary to use when creating genome locs
|
||||
|
|
@ -126,6 +130,24 @@ public final class GenomeLocParser {
|
|||
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
|
||||
}
|
||||
}
|
||||
|
||||
final Pattern numericEnding = Pattern.compile(".*:[0-9][0-9,]*(\\+|-[0-9][0-9,]*)?$");
|
||||
for(final SAMSequenceRecord contig1: this.getContigs().getSequences()) {
|
||||
final String name1 = contig1.getSequenceName();
|
||||
for (final SAMSequenceRecord contig2 : this.getContigs().getSequences()) {
|
||||
if (contig1 == contig2) continue;
|
||||
final String name2 = contig2.getSequenceName();
|
||||
|
||||
final Matcher numericMatcher = numericEnding.matcher(name1);
|
||||
if (name1.startsWith(name2) && numericMatcher.matches()) {
|
||||
logger.warn(AMBIGUOUS_CONTIG_WARNINIG);
|
||||
logger.warn("Contig name" + name1 + " has a prefix that is also the name of another contig (" +
|
||||
name1 +
|
||||
"), and has an ending that looks like a genomic position. The contigs in this reference may not parse reliably." +
|
||||
" In addition, the existence of a colon (:) in the contig name is not compliant with the VCF specifications (1.4.1.1).");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -350,7 +372,7 @@ public final class GenomeLocParser {
|
|||
@Requires("str != null")
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc parseGenomeLoc(final String str) {
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' or 'chr2:500+'
|
||||
//System.out.printf("Parsing location '%s'%n", str);
|
||||
|
||||
String contig = null;
|
||||
|
|
@ -358,15 +380,15 @@ public final class GenomeLocParser {
|
|||
int stop = -1;
|
||||
|
||||
final int colonIndex = str.lastIndexOf(":");
|
||||
if(colonIndex == -1) {
|
||||
contig = str.substring(0, str.length()); // chr1
|
||||
if (colonIndex == -1 || contigIsInDictionary(str) ) {
|
||||
contig = str; // chr1
|
||||
stop = Integer.MAX_VALUE;
|
||||
} else {
|
||||
contig = str.substring(0, colonIndex);
|
||||
final int dashIndex = str.indexOf('-', colonIndex);
|
||||
try {
|
||||
if(dashIndex == -1) {
|
||||
if(str.charAt(str.length() - 1) == '+') {
|
||||
if (dashIndex == -1) {
|
||||
if (str.charAt(str.length() - 1) == '+') {
|
||||
start = parsePosition(str.substring(colonIndex + 1, str.length() - 1)); // chr:1+
|
||||
stop = Integer.MAX_VALUE;
|
||||
} else {
|
||||
|
|
@ -377,7 +399,7 @@ public final class GenomeLocParser {
|
|||
start = parsePosition(str.substring(colonIndex + 1, dashIndex)); // chr1:1-1
|
||||
stop = parsePosition(str.substring(dashIndex + 1));
|
||||
}
|
||||
} catch(Exception e) {
|
||||
} catch (Exception e) {
|
||||
throw new UserException("Failed to parse Genome Location string: " + str, e);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,10 +26,10 @@
|
|||
package org.broadinstitute.gatk.utils.sam;
|
||||
|
||||
import htsjdk.samtools.*;
|
||||
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
|
||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
|
||||
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
|
||||
import org.broadinstitute.gatk.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -108,13 +108,39 @@ public class ArtificialSAMUtils {
|
|||
// make up some sequence records
|
||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
|
||||
rec.setSequenceLength(chromosomeSize);
|
||||
dict.addSequence(rec);
|
||||
}
|
||||
header.setSequenceDictionary(dict);
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates an artificial sam header, matching the parameters,
|
||||
*
|
||||
* @param contigNames List of names for contigs
|
||||
* @param contigLengths List of lengths for contigs. the two lists must be of the same size.
|
||||
* @return a SAMFileHeader with the references with the inputed names and lengths.
|
||||
**/
|
||||
public static SAMFileHeader createArtificialSamHeader(List<String> contigNames, List<Integer> contigLengths) {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
|
||||
// make up some sequence records
|
||||
if (contigLengths.size() != contigNames.size()) {
|
||||
throw new ReviewedGATKException("Passed in contigLengths is different from contigNames");
|
||||
}
|
||||
|
||||
for (int i = 0; i < contigLengths.size(); i++) {
|
||||
SAMSequenceRecord rec = new SAMSequenceRecord(contigNames.get(i), contigLengths.get(i));
|
||||
dict.addSequence(rec);
|
||||
}
|
||||
header.setSequenceDictionary(dict);
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates an artificial sam header based on the sequence dictionary dict
|
||||
*
|
||||
|
|
|
|||
|
|
@ -29,17 +29,17 @@ package org.broadinstitute.gatk.utils;
|
|||
import htsjdk.samtools.SAMFileHeader;
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.SAMSequenceRecord;
|
||||
import htsjdk.samtools.util.CollectionUtil;
|
||||
import htsjdk.tribble.Feature;
|
||||
import htsjdk.tribble.SimpleFeature;
|
||||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import htsjdk.variant.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import htsjdk.variant.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
|
@ -47,9 +47,7 @@ import org.testng.annotations.Test;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.assertTrue;
|
||||
|
|
@ -65,12 +63,88 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
private GenomeLocParser genomeLocParser;
|
||||
private SAMFileHeader header;
|
||||
|
||||
private GenomeLocParser bizareGenomeLocParser;
|
||||
private SAMFileHeader bizareHeader;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
public void init() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
||||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
|
||||
//Crazy contig names that are all allowed by the SAM-Spec
|
||||
bizareHeader = ArtificialSAMUtils.createArtificialSamHeader(
|
||||
CollectionUtil.makeList("LooksLikeHasPos:01",//1
|
||||
"LooksLikeARange:01-02",//2
|
||||
"LooksLikeWhole:1+",//3
|
||||
"\'Quoted\'",//4
|
||||
"***Stars***",//5
|
||||
"TwoColons:01:02",//6
|
||||
"ConfusingElements:1-2",//7
|
||||
"ConfusingElements:1",//8
|
||||
"ConfusingElements:1+"//9
|
||||
// , "ConfusingElements" //10 ... This element causes trouble
|
||||
),
|
||||
CollectionUtil.makeList(20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
// ,20 // ... not sure how to deal with it.
|
||||
));
|
||||
bizareGenomeLocParser = new GenomeLocParser(bizareHeader.getSequenceDictionary());
|
||||
}
|
||||
|
||||
|
||||
@DataProvider()
|
||||
public Object[][] TestConfusingContigNamesData(){
|
||||
List<Object[]> tests=new ArrayList<>();
|
||||
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1-")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1,0-200","ConfusingElements:1,0")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-3","ConfusingElements")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1","ConfusingElements")});
|
||||
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1+","ConfusingElements")});
|
||||
|
||||
return tests.toArray(new Object[tests.size()][]);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "TestConfusingContigNamesData" )
|
||||
void TestConfusingContigNames(final List<String> contigNames){
|
||||
|
||||
ValidationAppender checkWarning = new ValidationAppender(GenomeLocParser.AMBIGUOUS_CONTIG_WARNINIG);
|
||||
|
||||
final SAMFileHeader confusingHeader = ArtificialSAMUtils.createArtificialSamHeader(
|
||||
contigNames, Collections.nCopies(contigNames.size(), 20));
|
||||
|
||||
logger.addAppender(checkWarning);
|
||||
new GenomeLocParser(confusingHeader.getSequenceDictionary());
|
||||
|
||||
Assert.assertTrue(checkWarning.foundString(),"Expected warning about prefixes");
|
||||
}
|
||||
|
||||
|
||||
@DataProvider
|
||||
public Object[][] testBizzarreContigNamesData() {
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for (final SAMSequenceRecord sequence : bizareGenomeLocParser.getContigs().getSequences()) {
|
||||
String name = sequence.getSequenceName();
|
||||
tests.add(new Object[]{name, new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
|
||||
tests.add(new Object[]{name + ":1", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 1)});
|
||||
tests.add(new Object[]{name + ":1+", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
|
||||
tests.add(new Object[]{name + ":1-10", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 10)});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[tests.size()][]);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "testBizzarreContigNamesData")
|
||||
public void testBizzarreContigNames(String inputString, GenomeLoc expectedResult) {
|
||||
|
||||
Assert.assertEquals(bizareGenomeLocParser.parseGenomeLoc(inputString), expectedResult);
|
||||
}
|
||||
|
||||
|
||||
@Test(expectedExceptions=UserException.MalformedGenomeLoc.class)
|
||||
public void testGetContigIndex() {
|
||||
assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference
|
||||
|
|
|
|||
Loading…
Reference in New Issue