Fix for mis-sorted VCF files in CatVariants
When using CatVariants, VCF files were being sorted solely on the base pair position of the first record, ignoring the chromosome. This can become problematic when merging files from different chromosomes, espeically if you have multiple VCFs per chromosome. As an example, assume the following 3 lines are all in separate files: 1 10 1 100 2 20 The merged VCF from CatVariants (without -assumeSorted) would read: 1 10 2 20 1 100 This has the potential to break tools that expect chromosomes to be contiguous within a VCF file. This commit changes the comparator from one of Pair<Integer, File> to one of Pair<VariantContext, File>. We construct a VariantContextComparator from the provided reference, which will sort the first record by chromosome and position properly. Additionally, if -assumeSorted is given, we simply use a null VariantContext as the first record, which will all be equal (as all will be null)
This commit is contained in:
parent
45a1d82305
commit
8fc631b7ae
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools;
|
||||
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
||||
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
|
||||
import org.apache.log4j.BasicConfigurator;
|
||||
|
|
@ -47,6 +48,7 @@ import htsjdk.variant.vcf.VCFCodec;
|
|||
import htsjdk.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import htsjdk.variant.variantcontext.VariantContextComparator;
|
||||
import htsjdk.variant.variantcontext.writer.Options;
|
||||
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
|
||||
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
|
@ -228,9 +230,9 @@ public class CatVariants extends CommandLineProgram {
|
|||
|
||||
variant = parseVariantList(variant);
|
||||
|
||||
Comparator<Pair<Integer,File>> positionComparator = new PositionComparator();
|
||||
Comparator<Pair<VariantContext,File>> positionComparator = new PositionComparator(ref.getSequenceDictionary());
|
||||
|
||||
Queue<Pair<Integer,File>> priorityQueue;
|
||||
Queue<Pair<VariantContext,File>> priorityQueue;
|
||||
if (assumeSorted)
|
||||
priorityQueue = new LinkedList<>();
|
||||
else
|
||||
|
|
@ -244,7 +246,7 @@ public class CatVariants extends CommandLineProgram {
|
|||
return 1;
|
||||
|
||||
if (assumeSorted){
|
||||
priorityQueue.add(new Pair<>(0,file));
|
||||
priorityQueue.add(new Pair<VariantContext,File>(null,file));
|
||||
}
|
||||
else{
|
||||
if (!file.exists()) {
|
||||
|
|
@ -257,9 +259,8 @@ public class CatVariants extends CommandLineProgram {
|
|||
continue;
|
||||
}
|
||||
VariantContext vc = it.next();
|
||||
int firstPosition = vc.getStart();
|
||||
reader.close();
|
||||
priorityQueue.add(new Pair<>(firstPosition,file));
|
||||
priorityQueue.add(new Pair<>(vc,file));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -318,15 +319,19 @@ public class CatVariants extends CommandLineProgram {
|
|||
}
|
||||
}
|
||||
|
||||
private static class PositionComparator implements Comparator<Pair<Integer,File>> {
|
||||
private static class PositionComparator implements Comparator<Pair<VariantContext,File>> {
|
||||
|
||||
VariantContextComparator comp;
|
||||
|
||||
public PositionComparator(final SAMSequenceDictionary dict){
|
||||
comp = new VariantContextComparator(dict);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(Pair<Integer,File> p1, Pair<Integer,File> p2) {
|
||||
int startPositionP1 = p1.getFirst();
|
||||
int startPositionP2 = p2.getFirst();
|
||||
if (startPositionP1 == startPositionP2)
|
||||
return 0;
|
||||
return startPositionP1 < startPositionP2 ? -1 : 1 ;
|
||||
public int compare(final Pair<VariantContext,File> p1, final Pair<VariantContext,File> p2) {
|
||||
final VariantContext startPositionP1 = p1.getFirst();
|
||||
final VariantContext startPositionP2 = p2.getFirst();
|
||||
return comp.compare(startPositionP1, startPositionP2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,6 +57,8 @@ public class CatVariantsIntegrationTest {
|
|||
private final File CatVariantsVcf2 = new File(CatVariantsDir, "CatVariantsTest2.vcf");
|
||||
private final File CatVariantsBcf1 = new File(CatVariantsDir, "CatVariantsTest1.bcf");
|
||||
private final File CatVariantsBcf2 = new File(CatVariantsDir, "CatVariantsTest2.bcf");
|
||||
private final File CatVariantsVcf3 = new File(CatVariantsDir, "CatVariantsTest3.vcf");
|
||||
private final File CatVariantsVcf4 = new File(CatVariantsDir, "CatVariantsTest4.vcf");
|
||||
|
||||
private class CatVariantsTestProvider extends BaseTest.TestDataProvider {
|
||||
private final File file1;
|
||||
|
|
@ -126,6 +128,27 @@ public class CatVariantsIntegrationTest {
|
|||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "SortOrderTest")
|
||||
public Object[][] makeSortOrderTestProvider() {
|
||||
new CatVariantsTestProvider(CatVariantsVcf3, CatVariantsVcf4, BaseTest.createTempFile("CatVariantsSortOrderTest", ".vcf"), "7bfe34c78b9f1b56a28cc33262cdfd97");
|
||||
|
||||
return CatVariantsTestProvider.getTests(CatVariantsTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SortOrderTest")
|
||||
public void testSortOrder(final CatVariantsTestProvider cfg) throws IOException {
|
||||
|
||||
ProcessController pc = ProcessController.getThreadLocal();
|
||||
ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cfg.getCmdLine()));
|
||||
pc.execAndCheck(ps);
|
||||
|
||||
MD5DB.MD5Match result = md5db.testFileMD5("testSortOrder", "CatVariantsTestProvider", cfg.outputFile, cfg.md5, false);
|
||||
if(result.failed) {
|
||||
final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput);
|
||||
Assert.fail(failure.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MismatchedExtensionsTest")
|
||||
public Object[][] makeMismatchedExtensionsTestProvider() {
|
||||
return new Object[][]{
|
||||
|
|
|
|||
Loading…
Reference in New Issue