Fix for bug GSA-445: Sequence dictionary validation can be very slow with large numbers of contigs.

SequenceDictionaryUtils.getCommonContigsByName() was running in O(n^2) time due to a poor choice of data structure (contig names were returned as a List, so every membership check made by retainAll() was a linear scan) -- modified it to run in O(n) time by returning a HashSet instead. Also removed an unnecessary O(n log n) step at another stage in the sequence dictionary validation process (GenomeAnalysisEngine built a TreeSet of all read sequence names only to test whether it was empty; it now checks the dictionary size directly). In tests with a 181,813-entry sequence dictionary, runtime improved from an average of 21.4 minutes to 45.1 seconds.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b81228fec1
commit
80d547ae71
|
|
@ -869,12 +869,7 @@ public class GenomeAnalysisEngine {
|
||||||
// Compile a set of sequence names that exist in the BAM files.
|
// Compile a set of sequence names that exist in the BAM files.
|
||||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||||
|
|
||||||
Set<String> readsSequenceNames = new TreeSet<String>();
|
if (readsDictionary.size() == 0) {
|
||||||
for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
|
|
||||||
readsSequenceNames.add(dictionaryEntry.getSequenceName());
|
|
||||||
|
|
||||||
|
|
||||||
if (readsSequenceNames.size() == 0) {
|
|
||||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -353,13 +353,13 @@ public class SequenceDictionaryUtils {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||||
Set<String> intersectingSequenceNames = new HashSet<String>(getContigNames(dict1));
|
Set<String> intersectingSequenceNames = getContigNames(dict1);
|
||||||
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
||||||
return intersectingSequenceNames;
|
return intersectingSequenceNames;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<String> getContigNames(SAMSequenceDictionary dict) {
|
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||||
List<String> contigNames = new ArrayList<String>();
|
Set<String> contigNames = new HashSet<String>((int)(dict.size() / 0.75f) + 1, 0.75f);
|
||||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||||
contigNames.add(dictionaryEntry.getSequenceName());
|
contigNames.add(dictionaryEntry.getSequenceName());
|
||||||
return contigNames;
|
return contigNames;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue