From 8fc631b7aec16b9b30bcbfd5bf07f6c4c7e5dcf4 Mon Sep 17 00:00:00 2001 From: John Wallace Date: Thu, 21 May 2015 11:58:53 -0400 Subject: [PATCH] Fix for mis-sorted VCF files in CatVariants When using CatVariants, VCF files were being sorted solely on the base pair position of the first record, ignoring the chromosome. This can become problematic when merging files from different chromosomes, espeically if you have multiple VCFs per chromosome. As an example, assume the following 3 lines are all in separate files: 1 10 1 100 2 20 The merged VCF from CatVariants (without -assumeSorted) would read: 1 10 2 20 1 100 This has the potential to break tools that expect chromosomes to be contiguous within a VCF file. This commit changes the comparator from one of Pair to one of Pair. We construct a VariantContextComparator from the provided reference, which will sort the first record by chromosome and position properly. Additionally, if -assumeSorted is given, we simply use a null VariantContext as the first record, which will all be equal (as all will be null) --- .../gatk/tools/CatVariants.java | 29 +++++++++++-------- .../tools/CatVariantsIntegrationTest.java | 23 +++++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 391b0202f..5f4e1b516 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.tools; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import org.apache.log4j.BasicConfigurator; @@ -47,6 +48,7 @@ import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextComparator; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; @@ -228,9 +230,9 @@ public class CatVariants extends CommandLineProgram { variant = parseVariantList(variant); - Comparator> positionComparator = new PositionComparator(); + Comparator> positionComparator = new PositionComparator(ref.getSequenceDictionary()); - Queue> priorityQueue; + Queue> priorityQueue; if (assumeSorted) priorityQueue = new LinkedList<>(); else @@ -244,7 +246,7 @@ public class CatVariants extends CommandLineProgram { return 1; if (assumeSorted){ - priorityQueue.add(new Pair<>(0,file)); + priorityQueue.add(new Pair(null,file)); } else{ if (!file.exists()) { @@ -257,9 +259,8 @@ public class CatVariants extends CommandLineProgram { continue; } VariantContext vc = it.next(); - int firstPosition = vc.getStart(); reader.close(); - priorityQueue.add(new Pair<>(firstPosition,file)); + priorityQueue.add(new Pair<>(vc,file)); } } @@ -318,15 +319,19 @@ public class CatVariants extends CommandLineProgram { } } - private static class PositionComparator implements Comparator> { + private static class PositionComparator implements Comparator> { + + VariantContextComparator comp; + + public PositionComparator(final SAMSequenceDictionary dict){ + comp = new VariantContextComparator(dict); + } @Override - public int compare(Pair p1, Pair p2) { - int startPositionP1 = p1.getFirst(); - int startPositionP2 = p2.getFirst(); - if (startPositionP1 == startPositionP2) - return 0; - return startPositionP1 < startPositionP2 ? -1 : 1 ; + public int compare(final Pair p1, final Pair p2) { + final VariantContext startPositionP1 = p1.getFirst(); + final VariantContext startPositionP2 = p2.getFirst(); + return comp.compare(startPositionP1, startPositionP2); } } } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java index 69ae7a472..0e9c80451 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java @@ -57,6 +57,8 @@ public class CatVariantsIntegrationTest { private final File CatVariantsVcf2 = new File(CatVariantsDir, "CatVariantsTest2.vcf"); private final File CatVariantsBcf1 = new File(CatVariantsDir, "CatVariantsTest1.bcf"); private final File CatVariantsBcf2 = new File(CatVariantsDir, "CatVariantsTest2.bcf"); + private final File CatVariantsVcf3 = new File(CatVariantsDir, "CatVariantsTest3.vcf"); + private final File CatVariantsVcf4 = new File(CatVariantsDir, "CatVariantsTest4.vcf"); private class CatVariantsTestProvider extends BaseTest.TestDataProvider { private final File file1; @@ -126,6 +128,27 @@ public class CatVariantsIntegrationTest { } } + @DataProvider(name = "SortOrderTest") + public Object[][] makeSortOrderTestProvider() { + new CatVariantsTestProvider(CatVariantsVcf3, CatVariantsVcf4, BaseTest.createTempFile("CatVariantsSortOrderTest", ".vcf"), "7bfe34c78b9f1b56a28cc33262cdfd97"); + + return CatVariantsTestProvider.getTests(CatVariantsTestProvider.class); + } + + @Test(dataProvider = "SortOrderTest") + public void testSortOrder(final CatVariantsTestProvider cfg) throws IOException { + + ProcessController pc = ProcessController.getThreadLocal(); + ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cfg.getCmdLine())); + pc.execAndCheck(ps); + + MD5DB.MD5Match result = md5db.testFileMD5("testSortOrder", "CatVariantsTestProvider", cfg.outputFile, cfg.md5, false); + if(result.failed) { + final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput); + Assert.fail(failure.toString()); + } + } + @DataProvider(name = "MismatchedExtensionsTest") public Object[][] makeMismatchedExtensionsTestProvider() { return new Object[][]{