From 80d547ae71627e2f292a1a9c3d2b70f8e7efd76a Mon Sep 17 00:00:00 2001 From: droazen Date: Fri, 8 Apr 2011 18:33:10 +0000 Subject: [PATCH] Fix for bug GSA-445: Sequence dictionary validation can be very slow with large numbers of contigs. SequenceDictionaryUtils.getCommonContigsByName() was running in O(n^2) time due to poor choice of data structure -- modified it to run in O(n) time. Also removed an unnecessary O(n log n) step at another stage in the sequence dictionary validation process. In tests with a 181,813-entry sequence dictionary, runtime improved from an average of 21.4 minutes to 45.1 seconds. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5604 348d0f76-0448-11de-a6fe-93d51630548a --- .../broadinstitute/sting/gatk/GenomeAnalysisEngine.java | 7 +------ .../sting/utils/SequenceDictionaryUtils.java | 6 +++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index d8f30c5da..7cfbc0762 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -869,12 +869,7 @@ public class GenomeAnalysisEngine { // Compile a set of sequence names that exist in the BAM files. SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - Set readsSequenceNames = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) - readsSequenceNames.add(dictionaryEntry.getSequenceName()); - - - if (readsSequenceNames.size() == 0) { + if (readsDictionary.size() == 0) { logger.info("Reads file is unmapped. Skipping validation against reference."); return; } diff --git a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java index fc28d6c26..ee47a0f2b 100755 --- a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java @@ -353,13 +353,13 @@ public class SequenceDictionaryUtils { * @return */ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - Set intersectingSequenceNames = new HashSet(getContigNames(dict1)); + Set intersectingSequenceNames = getContigNames(dict1); intersectingSequenceNames.retainAll(getContigNames(dict2)); return intersectingSequenceNames; } - public static List getContigNames(SAMSequenceDictionary dict) { - List contigNames = new ArrayList(); + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new HashSet((int)(dict.size() / 0.75f) + 1, 0.75f); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames;