Fix for bug GSA-445: Sequence dictionary validation can be very slow with
large numbers of contigs. SequenceDictionaryUtils.getCommonContigsByName() was running in O(n^2) time due to a poor choice of data structure; it has been modified to run in O(n) time. Also removed an unnecessary O(n log n) step at another stage of the sequence dictionary validation process. In tests with a 181,813-entry sequence dictionary, runtime improved from an average of 21.4 minutes to 45.1 seconds. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5604 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b81228fec1
commit
80d547ae71
|
|
@ -869,12 +869,7 @@ public class GenomeAnalysisEngine {
|
|||
// Compile a set of sequence names that exist in the BAM files.
|
||||
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();
|
||||
|
||||
Set<String> readsSequenceNames = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences())
|
||||
readsSequenceNames.add(dictionaryEntry.getSequenceName());
|
||||
|
||||
|
||||
if (readsSequenceNames.size() == 0) {
|
||||
if (readsDictionary.size() == 0) {
|
||||
logger.info("Reads file is unmapped. Skipping validation against reference.");
|
||||
return;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -353,13 +353,13 @@ public class SequenceDictionaryUtils {
|
|||
* @return
|
||||
*/
|
||||
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
|
||||
Set<String> intersectingSequenceNames = new HashSet<String>(getContigNames(dict1));
|
||||
Set<String> intersectingSequenceNames = getContigNames(dict1);
|
||||
intersectingSequenceNames.retainAll(getContigNames(dict2));
|
||||
return intersectingSequenceNames;
|
||||
}
|
||||
|
||||
public static List<String> getContigNames(SAMSequenceDictionary dict) {
|
||||
List<String> contigNames = new ArrayList<String>();
|
||||
public static Set<String> getContigNames(SAMSequenceDictionary dict) {
|
||||
Set<String> contigNames = new HashSet<String>((int)(dict.size() / 0.75f) + 1, 0.75f);
|
||||
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
|
||||
contigNames.add(dictionaryEntry.getSequenceName());
|
||||
return contigNames;
|
||||
|
|
|
|||
Loading…
Reference in New Issue