From 8fbc0c847356652579727ec35fe3efd337510cd0 Mon Sep 17 00:00:00 2001
From: aaron
Date: Tue, 24 Nov 2009 19:19:47 +0000
Subject: [PATCH] fix for bug GSA-234: fasta index files couldn't handle anything but letters, numbers, or spaces in the contig name

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2147 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../sting/utils/fasta/FastaSequenceIndex.java |  4 +-
 .../utils/fasta/FastaSequenceIndexTest.java   | 62 +++++++++++++++++++
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java
index 7d44c3786..916317945 100755
--- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java
+++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java
@@ -47,8 +47,8 @@ public class FastaSequenceIndex implements Iterable {
         Scanner scanner = new Scanner(indexFile);
 
         while( scanner.hasNext() ) {
-            // Tokenize and validate the index line.
-            String result = scanner.findInLine("([\\w\\s\\.]+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
+            // Tokenize and validate the index line.
+            String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
             if( result == null )
                 throw new PicardException("Found invalid line in index file:" + scanner.nextLine());
             MatchResult tokens = scanner.match();
diff --git a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java
index 397d0657b..50ddd6037 100755
--- a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java
+++ b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java
@@ -19,17 +19,25 @@ import java.util.Iterator;
  * To change this template use File | Settings | File Templates.
  */
 public class FastaSequenceIndexTest extends BaseTest {
+    // our basic human 18 fai
     private static String sequenceIndexName = null;
     private FastaSequenceIndex sequenceIndex = null;
 
+    // a custom index that tests the colon, and semi-colon, and other random characters
+    private static String sequenceIndexColonSemiColonTestName = null;
+    private FastaSequenceIndex sequenceIndexColonSemiColonTest = null;
+
+
     @BeforeClass
     public static void initialize() {
         sequenceIndexName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai";
+        sequenceIndexColonSemiColonTestName = "/humgen/gsa-scr1/GATK_Data/Validation_Data/testing.fai";
     }
 
     @Before
     public void doForEachTest() throws FileNotFoundException {
         sequenceIndex = new FastaSequenceIndex( new File(sequenceIndexName) );
+        sequenceIndexColonSemiColonTest = new FastaSequenceIndex( new File(sequenceIndexColonSemiColonTestName) );
     }
 
     @Test
@@ -189,4 +197,58 @@ public class FastaSequenceIndexTest extends BaseTest {
         Assert.assertEquals("Contig chrX_random is not present", "chrX_random", sequenceIndexEntries.next().getContig());
         Assert.assertFalse("Iterator still has more entries", sequenceIndexEntries.hasNext());
     }
+
+    @Test
+    public void testSpecialCharacters() {
+        /* file contents (NOTE(review): the ";;;;;;" row disagrees with the assertions below, which expect ";;;;;;;;" 123 234 456 789 -- confirm against testing.fai):
+        chrM 16571 6 50 51
+        chr1;boat 247249719 16915 50 51
+        chr2:money 242951149 252211635 50 51
+        chr3::; 199501827 500021813 50 51
+        ;;;;;; 1234 1234 1234 1234
+        file:gi|17981852|ref|NC_001807.4| 16571 2911876801 70 71
+        */
+        Iterator sequenceIndexEntries = sequenceIndexColonSemiColonTest.iterator();
+        FastaSequenceIndexEntry ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig chrM is not present","chrM",ent.getContig());
+        Assert.assertEquals("Contig chrM size is not correct",16571,ent.getSize());
+        Assert.assertEquals("Contig chrM location is not correct",6,ent.getLocation());
+        Assert.assertEquals("Contig chrM bases per line is not correct",50,ent.getBasesPerLine());
+        Assert.assertEquals("Contig chrM bytes per line is not correct",51,ent.getBytesPerLine());
+
+        ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig chr1;boat is not present","chr1;boat",ent.getContig());
+        Assert.assertEquals("Contig chr1;boat size is not correct",247249719,ent.getSize());
+        Assert.assertEquals("Contig chr1;boat location is not correct",16915,ent.getLocation());
+        Assert.assertEquals("Contig chr1;boat bases per line is not correct",50,ent.getBasesPerLine());
+        Assert.assertEquals("Contig chr1;boat bytes per line is not correct",51,ent.getBytesPerLine());
+
+        ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig chr2:money is not present","chr2:money",ent.getContig());
+        Assert.assertEquals("Contig chr2:money size is not correct",242951149,ent.getSize());
+        Assert.assertEquals("Contig chr2:money location is not correct",252211635,ent.getLocation());
+        Assert.assertEquals("Contig chr2:money bases per line is not correct",50,ent.getBasesPerLine());
+        Assert.assertEquals("Contig chr2:money bytes per line is not correct",51,ent.getBytesPerLine());
+
+        ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig chr3::; is not present","chr3::;",ent.getContig());
+        Assert.assertEquals("Contig chr3::; size is not correct",199501827,ent.getSize());
+        Assert.assertEquals("Contig chr3::; location is not correct",500021813,ent.getLocation());
+        Assert.assertEquals("Contig chr3::; bases per line is not correct",50,ent.getBasesPerLine());
+        Assert.assertEquals("Contig chr3::; bytes per line is not correct",51,ent.getBytesPerLine());
+
+        ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig ;;;;;;;; is not present",";;;;;;;;",ent.getContig());
+        Assert.assertEquals("Contig ;;;;;;;; size is not correct",123,ent.getSize());
+        Assert.assertEquals("Contig ;;;;;;;; location is not correct",234,ent.getLocation());
+        Assert.assertEquals("Contig ;;;;;;;; bases per line is not correct",456,ent.getBasesPerLine());
+        Assert.assertEquals("Contig ;;;;;;;; bytes per line is not correct",789,ent.getBytesPerLine());
+
+        ent = sequenceIndexEntries.next();
+        Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| is not present","file:gi|17981852|ref|NC_001807.4|",ent.getContig());
+        Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| size is not correct",16571,ent.getSize());
+        Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| location is not correct",2911876801L,ent.getLocation());
+        Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| bases per line is not correct",70,ent.getBasesPerLine());
+        Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| bytes per line is not correct",71,ent.getBytesPerLine());
+    }
 }