Merge pull request #1273 from broadinstitute/yf_Fix_GATK_bug_around_hg38_contignames

Various fixes surrounding GenomeLoc parser and hg38 contig names
This commit is contained in:
Yossi Farjoun 2016-02-02 07:10:58 -05:00
commit 3b327ac0e4
3 changed files with 140 additions and 18 deletions

View File

@ -28,15 +28,18 @@ package org.broadinstitute.gatk.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import com.google.java.contract.ThrowEnsures;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.tribble.Feature;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Factory class for creating GenomeLocs
*/
@ -105,6 +108,7 @@ public final class GenomeLocParser {
this(seqDict, ValidationLevel.STANDARD);
}
static public String AMBIGUOUS_CONTIG_WARNINIG = "Ambiguous contig prefix!";
/**
* Create a genome loc parser based on seqDict with the specified level of validation
* @param seqDict the sequence dictionary to use when creating genome locs
@ -126,6 +130,24 @@ public final class GenomeLocParser {
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
}
}
final Pattern numericEnding = Pattern.compile(".*:[0-9][0-9,]*(\\+|-[0-9][0-9,]*)?$");
for(final SAMSequenceRecord contig1: this.getContigs().getSequences()) {
final String name1 = contig1.getSequenceName();
for (final SAMSequenceRecord contig2 : this.getContigs().getSequences()) {
if (contig1 == contig2) continue;
final String name2 = contig2.getSequenceName();
final Matcher numericMatcher = numericEnding.matcher(name1);
if (name1.startsWith(name2) && numericMatcher.matches()) {
logger.warn(AMBIGUOUS_CONTIG_WARNINIG);
logger.warn("Contig name" + name1 + " has a prefix that is also the name of another contig (" +
name1 +
"), and has an ending that looks like a genomic position. The contigs in this reference may not parse reliably." +
" In addition, the existence of a colon (:) in the contig name is not compliant with the VCF specifications (1.4.1.1).");
}
}
}
}
/**
@ -350,7 +372,7 @@ public final class GenomeLocParser {
@Requires("str != null")
@Ensures("result != null")
public GenomeLoc parseGenomeLoc(final String str) {
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' or 'chr2:500+'
//System.out.printf("Parsing location '%s'%n", str);
String contig = null;
@ -358,15 +380,15 @@ public final class GenomeLocParser {
int stop = -1;
final int colonIndex = str.lastIndexOf(":");
if(colonIndex == -1) {
contig = str.substring(0, str.length()); // chr1
if (colonIndex == -1 || contigIsInDictionary(str) ) {
contig = str; // chr1
stop = Integer.MAX_VALUE;
} else {
contig = str.substring(0, colonIndex);
final int dashIndex = str.indexOf('-', colonIndex);
try {
if(dashIndex == -1) {
if(str.charAt(str.length() - 1) == '+') {
if (dashIndex == -1) {
if (str.charAt(str.length() - 1) == '+') {
start = parsePosition(str.substring(colonIndex + 1, str.length() - 1)); // chr:1+
stop = Integer.MAX_VALUE;
} else {
@ -377,7 +399,7 @@ public final class GenomeLocParser {
start = parsePosition(str.substring(colonIndex + 1, dashIndex)); // chr1:1-1
stop = parsePosition(str.substring(dashIndex + 1));
}
} catch(Exception e) {
} catch (Exception e) {
throw new UserException("Failed to parse Genome Location string: " + str, e);
}
}

View File

@ -26,10 +26,10 @@
package org.broadinstitute.gatk.utils.sam;
import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
@ -108,13 +108,39 @@ public class ArtificialSAMUtils {
// make up some sequence records
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
rec.setSequenceLength(chromosomeSize);
dict.addSequence(rec);
}
header.setSequenceDictionary(dict);
return header;
}
/**
* Creates an artificial sam header, matching the parameters,
*
* @param contigNames List of names for contigs
* @param contigLengths List of lengths for contigs. the two lists must be of the same size.
* @return a SAMFileHeader with the references with the inputed names and lengths.
**/
public static SAMFileHeader createArtificialSamHeader(List<String> contigNames, List<Integer> contigLengths) {
SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate);
SAMSequenceDictionary dict = new SAMSequenceDictionary();
// make up some sequence records
if (contigLengths.size() != contigNames.size()) {
throw new ReviewedGATKException("Passed in contigLengths is different from contigNames");
}
for (int i = 0; i < contigLengths.size(); i++) {
SAMSequenceRecord rec = new SAMSequenceRecord(contigNames.get(i), contigLengths.get(i));
dict.addSequence(rec);
}
header.setSequenceDictionary(dict);
return header;
}
/**
* Creates an artificial sam header based on the sequence dictionary dict
*

View File

@ -29,17 +29,17 @@ package org.broadinstitute.gatk.utils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.tribble.Feature;
import htsjdk.tribble.SimpleFeature;
import org.broadinstitute.gatk.utils.BaseTest;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@ -47,9 +47,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
@ -65,12 +63,88 @@ public class GenomeLocParserUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private SAMFileHeader header;
private GenomeLocParser bizareGenomeLocParser;
private SAMFileHeader bizareHeader;
@BeforeClass
public void init() {
public void init() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
//Crazy contig names that are all allowed by the SAM-Spec
bizareHeader = ArtificialSAMUtils.createArtificialSamHeader(
CollectionUtil.makeList("LooksLikeHasPos:01",//1
"LooksLikeARange:01-02",//2
"LooksLikeWhole:1+",//3
"\'Quoted\'",//4
"***Stars***",//5
"TwoColons:01:02",//6
"ConfusingElements:1-2",//7
"ConfusingElements:1",//8
"ConfusingElements:1+"//9
// , "ConfusingElements" //10 ... This element causes trouble
),
CollectionUtil.makeList(20, 20, 20, 20, 20, 20, 20, 20, 20
// ,20 // ... not sure how to deal with it.
));
bizareGenomeLocParser = new GenomeLocParser(bizareHeader.getSequenceDictionary());
}
@DataProvider()
public Object[][] TestConfusingContigNamesData(){
List<Object[]> tests=new ArrayList<>();
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1-")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1,0-200","ConfusingElements:1,0")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-2,00","ConfusingElements:1")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1-3","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1","ConfusingElements")});
tests.add(new Object[]{CollectionUtil.makeList("ConfusingElements:1+","ConfusingElements")});
return tests.toArray(new Object[tests.size()][]);
}
@Test(dataProvider = "TestConfusingContigNamesData" )
void TestConfusingContigNames(final List<String> contigNames){
ValidationAppender checkWarning = new ValidationAppender(GenomeLocParser.AMBIGUOUS_CONTIG_WARNINIG);
final SAMFileHeader confusingHeader = ArtificialSAMUtils.createArtificialSamHeader(
contigNames, Collections.nCopies(contigNames.size(), 20));
logger.addAppender(checkWarning);
new GenomeLocParser(confusingHeader.getSequenceDictionary());
Assert.assertTrue(checkWarning.foundString(),"Expected warning about prefixes");
}
@DataProvider
public Object[][] testBizzarreContigNamesData() {
List<Object[]> tests = new ArrayList<>();
for (final SAMSequenceRecord sequence : bizareGenomeLocParser.getContigs().getSequences()) {
String name = sequence.getSequenceName();
tests.add(new Object[]{name, new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
tests.add(new Object[]{name + ":1", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 1)});
tests.add(new Object[]{name + ":1+", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, sequence.getSequenceLength())});
tests.add(new Object[]{name + ":1-10", new GenomeLoc(name, bizareGenomeLocParser.getContigIndex(name), 1, 10)});
}
return tests.toArray(new Object[tests.size()][]);
}
@Test(dataProvider = "testBizzarreContigNamesData")
public void testBizzarreContigNames(String inputString, GenomeLoc expectedResult) {
Assert.assertEquals(bizareGenomeLocParser.parseGenomeLoc(inputString), expectedResult);
}
@Test(expectedExceptions=UserException.MalformedGenomeLoc.class)
public void testGetContigIndex() {
assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference