Fix for mis-sorted VCF files in CatVariants

When using CatVariants, VCF files were being sorted solely on the base
pair position of the first record, ignoring the chromosome.  This can
become problematic when merging files from different chromosomes,
espeically if you have multiple VCFs per chromosome.

As an example, assume the following 3 lines are all in separate files:
1       10
1       100
2       20

The merged VCF from CatVariants (without -assumeSorted) would read:
1       10
2       20
1       100

This has the potential to break tools that expect chromosomes to be
contiguous within a VCF file.

This commit changes the comparator from one of Pair<Integer, File> to
one of Pair<VariantContext, File>.  We construct a
VariantContextComparator from the provided reference, which will sort
the first record by chromosome and position properly.  Additionally, if
-assumeSorted is given, we simply use a null VariantContext as the first
record, which will all be equal (as all will be null)
This commit is contained in:
John Wallace 2015-05-21 11:58:53 -04:00 committed by Ron Levine
parent 45a1d82305
commit 8fc631b7ae
2 changed files with 40 additions and 12 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.tools;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import org.apache.log4j.BasicConfigurator;
@ -47,6 +48,7 @@ import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;
@ -228,9 +230,9 @@ public class CatVariants extends CommandLineProgram {
variant = parseVariantList(variant);
Comparator<Pair<Integer,File>> positionComparator = new PositionComparator();
Comparator<Pair<VariantContext,File>> positionComparator = new PositionComparator(ref.getSequenceDictionary());
Queue<Pair<Integer,File>> priorityQueue;
Queue<Pair<VariantContext,File>> priorityQueue;
if (assumeSorted)
priorityQueue = new LinkedList<>();
else
@ -244,7 +246,7 @@ public class CatVariants extends CommandLineProgram {
return 1;
if (assumeSorted){
priorityQueue.add(new Pair<>(0,file));
priorityQueue.add(new Pair<VariantContext,File>(null,file));
}
else{
if (!file.exists()) {
@ -257,9 +259,8 @@ public class CatVariants extends CommandLineProgram {
continue;
}
VariantContext vc = it.next();
int firstPosition = vc.getStart();
reader.close();
priorityQueue.add(new Pair<>(firstPosition,file));
priorityQueue.add(new Pair<>(vc,file));
}
}
@ -318,15 +319,19 @@ public class CatVariants extends CommandLineProgram {
}
}
private static class PositionComparator implements Comparator<Pair<Integer,File>> {
private static class PositionComparator implements Comparator<Pair<VariantContext,File>> {
VariantContextComparator comp;
public PositionComparator(final SAMSequenceDictionary dict){
comp = new VariantContextComparator(dict);
}
@Override
public int compare(Pair<Integer,File> p1, Pair<Integer,File> p2) {
int startPositionP1 = p1.getFirst();
int startPositionP2 = p2.getFirst();
if (startPositionP1 == startPositionP2)
return 0;
return startPositionP1 < startPositionP2 ? -1 : 1 ;
public int compare(final Pair<VariantContext,File> p1, final Pair<VariantContext,File> p2) {
final VariantContext startPositionP1 = p1.getFirst();
final VariantContext startPositionP2 = p2.getFirst();
return comp.compare(startPositionP1, startPositionP2);
}
}
}

View File

@ -57,6 +57,8 @@ public class CatVariantsIntegrationTest {
private final File CatVariantsVcf2 = new File(CatVariantsDir, "CatVariantsTest2.vcf");
private final File CatVariantsBcf1 = new File(CatVariantsDir, "CatVariantsTest1.bcf");
private final File CatVariantsBcf2 = new File(CatVariantsDir, "CatVariantsTest2.bcf");
private final File CatVariantsVcf3 = new File(CatVariantsDir, "CatVariantsTest3.vcf");
private final File CatVariantsVcf4 = new File(CatVariantsDir, "CatVariantsTest4.vcf");
private class CatVariantsTestProvider extends BaseTest.TestDataProvider {
private final File file1;
@ -126,6 +128,27 @@ public class CatVariantsIntegrationTest {
}
}
@DataProvider(name = "SortOrderTest")
public Object[][] makeSortOrderTestProvider() {
new CatVariantsTestProvider(CatVariantsVcf3, CatVariantsVcf4, BaseTest.createTempFile("CatVariantsSortOrderTest", ".vcf"), "7bfe34c78b9f1b56a28cc33262cdfd97");
return CatVariantsTestProvider.getTests(CatVariantsTestProvider.class);
}
@Test(dataProvider = "SortOrderTest")
public void testSortOrder(final CatVariantsTestProvider cfg) throws IOException {
ProcessController pc = ProcessController.getThreadLocal();
ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cfg.getCmdLine()));
pc.execAndCheck(ps);
MD5DB.MD5Match result = md5db.testFileMD5("testSortOrder", "CatVariantsTestProvider", cfg.outputFile, cfg.md5, false);
if(result.failed) {
final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput);
Assert.fail(failure.toString());
}
}
@DataProvider(name = "MismatchedExtensionsTest")
public Object[][] makeMismatchedExtensionsTestProvider() {
return new Object[][]{