Fix for bug GSA-445: Sequence dictionary validation can be very slow with
large numbers of contigs.

SequenceDictionaryUtils.getCommonContigsByName() was
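running in O(n^2) time due to a poor choice of data structure (retainAll()
against an ArrayList performs a linear contains() scan for every element) --
modified it to use HashSets so that it runs in O(n) time. Also removed an
unnecessary O(n log n) step (building a TreeSet of the reads' sequence names
just to test whether it was empty) at another stage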
in the sequence dictionary validation process. In tests with a 181,813-entry
sequence dictionary, runtime improved from an average of 21.4 minutes to 45.1
seconds.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
droazen 2011-04-08 18:33:10 +00:00
parent b81228fec1
commit 80d547ae71
2 changed files with 4 additions and 9 deletions

View File

@ -869,12 +869,7 @@ public class GenomeAnalysisEngine {
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
Set<String> readsSequenceNames = new TreeSet<String>();
for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
readsSequenceNames.add(dictionaryEntry.getSequenceName());
if (readsSequenceNames.size() == 0) {
if (readsDictionary.size() == 0) {
logger.info("Reads file is unmapped. Skipping validation against reference.");
return;
}

View File

@ -353,13 +353,13 @@ public class SequenceDictionaryUtils {
* @return
*/
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
Set<String> intersectingSequenceNames = new HashSet<String>(getContigNames(dict1));
Set<String> intersectingSequenceNames = getContigNames(dict1);
intersectingSequenceNames.retainAll(getContigNames(dict2));
return intersectingSequenceNames;
}
public static List<String> getContigNames(SAMSequenceDictionary dict) {
List<String> contigNames = new ArrayList<String>();
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
Set<String> contigNames = new HashSet<String>((int)(dict.size() / 0.75f) + 1, 0.75f);
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
contigNames.add(dictionaryEntry.getSequenceName());
return contigNames;