From 39a47491a94a0e8de2783950d6480255213eb10f Mon Sep 17 00:00:00 2001 From: aaron Date: Mon, 21 Sep 2009 22:37:47 +0000 Subject: [PATCH] changes to make GenomeLoc string parsing 25% faster git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1675 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/GenomeLocParser.java | 67 ++++++++++--------- .../sting/utils/GenomeLocParserTest.java | 20 +++++- 2 files changed, 52 insertions(+), 35 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 03f5cf440..2dc2e40db 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -52,10 +52,7 @@ import java.util.regex.Pattern; public class GenomeLocParser { private static Logger logger = Logger.getLogger(GenomeLocParser.class); - private static final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 - private static final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 - private static final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3 - private static final Pattern regex4 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)\\+"); // matches case 4 + private static final Pattern mPattern = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)?\\+?(-)?([\\d,]+)?$"); // matches case 3 // -------------------------------------------------------------------------------------------------------------- @@ -143,57 +140,61 @@ public class GenomeLocParser { public static GenomeLoc parseGenomeLoc(final String str) { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); - + /*try { + Thread.sleep(2000); + } catch (InterruptedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } */ String contig = null; long start = 1; long stop = Integer.MAX_VALUE; boolean bad = false; - Matcher match1 = regex1.matcher(str); - Matcher match2 = regex2.matcher(str); - Matcher match3 = regex3.matcher(str); - Matcher match4 = regex4.matcher(str); + Matcher match = mPattern.matcher(str); try { - if (match1.matches()) { - contig = match1.group(1); - } else if (match2.matches()) { - contig = match2.group(1); - start = parsePosition(match2.group(2)); - stop = start; - } else if (match4.matches()) { - contig = match4.group(1); - start = parsePosition(match4.group(2)); - } else if (match3.matches()) { - contig = match3.group(1); - start = parsePosition(match3.group(2)); - stop = parsePosition(match3.group(3)); + if (match.matches()) { + contig = match.group(1); + if (match.groupCount() > 1) { + switch (match.groupCount()) { + case 2: + start = stop = parsePosition(match.group(2)); + break; + case 3: + start = parsePosition(match.group(2)); + if (!match.group(3).equals("+")) bad = true; + break; + case 4: + start = parsePosition(match.group(2)); + stop = parsePosition(match.group(4)); + break; + default: + bad = true; + break; - if (start > stop) - bad = true; + } + } } else { bad = true; } - } catch (Exception e) { - bad = true; } - if (bad) { - throw new StingException("Invalid Genome Location string: " + str); + catch (Exception e) { + bad = true; } - if (stop == Integer.MAX_VALUE && hasKnownContigOrdering()) { + if (bad) + throw new StingException("Invalid Genome Location string: "); // + str); + + + if (stop == Integer.MAX_VALUE && hasKnownContigOrdering()) // lookup the actually stop position! stop = getContigInfo(contig).getSequenceLength(); - } if (!isContigValid(contig)) throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference."); - GenomeLoc loc = parseGenomeLoc(contig, start, stop); - // System.out.printf(" => Parsed location '%s' into %s%n", str, loc); - return loc; } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java index 2cdfd9ad7..2ee094ee3 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java @@ -3,10 +3,16 @@ package org.broadinstitute.sting.utils; import static junit.framework.Assert.assertTrue; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.junit.Assert.assertEquals; import org.junit.Test; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + /** * @author aaron @@ -117,10 +123,20 @@ public class GenomeLocParserTest extends BaseTest { @Test public void testCreateGenomeLoc5() { - GenomeLoc loc = GenomeLocParser.createGenomeLoc(0,1,100); + GenomeLoc loc = GenomeLocParser.createGenomeLoc(0, 1, 100); GenomeLoc copy = GenomeLocParser.createGenomeLoc(loc); assertEquals(copy.getContigIndex(), 0); assertEquals(100, copy.getStop()); - assertEquals(1, copy .getStart()); + assertEquals(1, copy.getStart()); + } + + @Test + public void testGenomeLocParserList() { + long start = System.currentTimeMillis(); + List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"})); + Collections.sort(parsedIntervals); + LinkedList loc = new LinkedList(GenomeLocParser.mergeOverlappingLocations(parsedIntervals)); + long stop = System.currentTimeMillis(); + logger.warn("Elapsed time = " + (stop - start)); } }