diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java
index b1fa87807..e8ef39e3e 100644
--- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java
+++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/tools/CatVariants.java
@@ -53,7 +53,6 @@ import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFact
import java.io.*;
import java.util.*;
-
/**
*
* Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples
@@ -73,13 +72,12 @@ import java.util.*;
*
Input
*
* One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order).
- * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'.
* If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag.
*
*
*
Output
*
- * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'.
+ * A combined VCF or BCF. The output file should have the same extension as the input(s).
* <\p>
*
*
Important note
@@ -113,17 +111,17 @@ public class CatVariants extends CommandLineProgram {
* The VCF or BCF files to merge together
*
* CatVariants can take any number of -V arguments on the command line. Each -V argument
- * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more
+ * will be included in the final merged output VCF/BCF. The order of arguments does not matter, but it runs more
* efficiently if they are sorted based on the intervals and the assumeSorted argument is used.
*
*/
- @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true)
+ @Input(fullName="variant", shortName="V", doc="Input VCF file/s", required = true)
private List variant = null;
- @Output(fullName = "outputFile", shortName = "out", doc = "output file name .vcf or .bcf", required = true)
+ @Output(fullName = "outputFile", shortName = "out", doc = "output file", required = true)
private File outputFile = null;
- @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false)
+ @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false)
private Boolean assumeSorted = false;
@Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false)
@@ -137,19 +135,69 @@ public class CatVariants extends CommandLineProgram {
*/
private static void printUsage() {
+ // Usage text goes to stderr, the same stream fileExtensionCheck writes its error to just before calling this.
System.err.println("Usage: java -cp target/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants --reference --variant --outputFile [--assumeSorted]");
- System.err.println(" The input file(s) can be of type: VCF (must end in .vcf or .VCF) or");
- System.err.println(" BCF2 (must end in .bcf or .BCF).");
- System.err.println(" Output file must be of type vcf or bcf (must end in .vcf or .bcf).");
+ System.err.println(" The output file must be of the same type as all input files.");
System.err.println(" If the input files are already sorted, then indicate that with --assumeSorted to improve performance.");
}
+ /** File formats CatVariants distinguishes, as derived from a file name's extension by fileExtensionCheck. */
+ private enum FileType {
+ VCF, // name ends in ".vcf" (compared after lower-casing)
+ BCF, // name ends in ".bcf" (compared after lower-casing)
+ BLOCK_COMPRESSED_VCF, // name ends in ".vcf" followed by one of AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS
+ INVALID // input extension not recognized, or input and output extensions disagree
+ }
+
+    /**
+     * Checks that an input file carries an extension CatVariants supports and that the output
+     * file carries the same extension. Names are compared after lower-casing, so ".VCF" and
+     * ".vcf" are treated alike.
+     *
+     * When the check fails, a diagnostic naming the offending file(s) is printed to stderr,
+     * followed by the usage text.
+     *
+     * @param inFile  input variant file whose name is inspected
+     * @param outFile output file whose name must end with the same extension as inFile
+     * @return the type of inFile when both extensions agree, FileType.INVALID otherwise
+     */
+    private FileType fileExtensionCheck(File inFile, File outFile) {
+        // Locale.ROOT keeps lower-casing independent of the JVM's default locale. With the
+        // no-arg toLowerCase(), a locale with special casing rules (e.g. Turkish, where 'I'
+        // becomes a dotless i) could make an upper-case extension fail to match.
+        final String inFileName = inFile.toString().toLowerCase(Locale.ROOT);
+        final String outFileName = outFile.toString().toLowerCase(Locale.ROOT);
+
+        // Remembers what the input looked like even if the output does not match, so the
+        // error message below can distinguish "bad input" from "input/output mismatch".
+        FileType inFileType = FileType.INVALID;
+
+        if (inFileName.endsWith(".vcf")) {
+            inFileType = FileType.VCF;
+            if (outFileName.endsWith(".vcf"))
+                return inFileType;
+        }
+
+        if (inFileName.endsWith(".bcf")) {
+            inFileType = FileType.BCF;
+            if (outFileName.endsWith(".bcf"))
+                return inFileType;
+        }
+
+        // Block-compressed VCFs: the output must use the very same compression suffix.
+        for (String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) {
+            if (inFileName.endsWith(".vcf" + extension)) {
+                inFileType = FileType.BLOCK_COMPRESSED_VCF;
+                if (outFileName.endsWith(".vcf" + extension))
+                    return inFileType;
+            }
+        }
+
+        if (inFileType == FileType.INVALID)
+            System.err.println(String.format("File extension for input file %s is not valid for CatVariants", inFile));
+        else
+            System.err.println(String.format("File extension mismatch between input %s and output %s", inFile, outFile));
+
+        printUsage();
+        return FileType.INVALID;
+    }
+
+    /**
+     * Opens a reader on the given file using the codec that corresponds to its type.
+     *
+     * @param fileType type previously determined for the file
+     * @param file     file to open; its absolute path is handed to AbstractFeatureReader
+     * @return a reader backed by VCFCodec for VCF and BLOCK_COMPRESSED_VCF, by BCF2Codec for BCF,
+     *         or null for any other type
+     */
+    private FeatureReader getFeatureReader(final FileType fileType, final File file) {
+        switch (fileType) {
+            case BCF:
+                return AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false);
+            case VCF:
+            case BLOCK_COMPRESSED_VCF:
+                // one code path for both: getFeatureReader will handle block-compressed and plain text VCFs
+                return AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
+            default:
+                // no codec for this type; callers get null, exactly as before
+                return null;
+        }
+    }
+
@Override
protected int execute() throws Exception {
- //if(help){
- // printUsage();
- // return 1;
- //}
-
BasicConfigurator.configure();
logger.setLevel(Level.INFO);
@@ -162,37 +210,27 @@ public class CatVariants extends CommandLineProgram {
Comparator> positionComparator = new PositionComparator();
-
- //PriorityQueue>> queue =
- // new PriorityQueue>>(2000, comparator);
Queue> priorityQueue;
- if(assumeSorted)
- priorityQueue = new LinkedList>();
+ if (assumeSorted)
+ priorityQueue = new LinkedList<>();
else
- priorityQueue = new PriorityQueue>(10000, positionComparator);
+ priorityQueue = new PriorityQueue<>(10000, positionComparator);
- Iterator files = variant.iterator();
- File file;
- while (files.hasNext()) {
- file = files.next();
- if (!(file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF") || file.getName().endsWith(".bcf") || file.getName().endsWith(".BCF"))){
- System.err.println("File " + file.getAbsolutePath() + " should be .vcf or .bcf");
- printUsage();
+ FileType fileType = FileType.INVALID;
+ for (File file : variant) {
+ // if it returns a valid type, it will be the same for all files
+ fileType = fileExtensionCheck(file, outputFile);
+ if (fileType == FileType.INVALID)
return 1;
- }
+
if (assumeSorted){
- priorityQueue.add(new Pair(0,file));
+ priorityQueue.add(new Pair<>(0,file));
}
else{
if (!file.exists()) {
throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath()));
}
- FeatureReader reader;
- boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF"));
- if(useVCF)
- reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
- else
- reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false);
+ FeatureReader reader = getFeatureReader(fileType, file);
Iterator it = reader.iterator();
if(!it.hasNext()){
System.err.println(String.format("File %s is empty. This file will be ignored",file.getAbsolutePath()));
@@ -201,37 +239,25 @@ public class CatVariants extends CommandLineProgram {
VariantContext vc = it.next();
int firstPosition = vc.getStart();
reader.close();
- //queue.add(new Pair>(firstPosition,reader));
- priorityQueue.add(new Pair(firstPosition,file));
+ priorityQueue.add(new Pair<>(firstPosition,file));
}
}
- if (!(outputFile.getName().endsWith(".vcf") || outputFile.getName().endsWith(".VCF"))){
- throw new UserException(String.format("Output file %s should be .vcf", outputFile));
- }
-
FileOutputStream outputStream = new FileOutputStream(outputFile);
EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary());
final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);
boolean firstFile = true;
- int count =0;
- //while(!queue.isEmpty()){
+ int count = 0;
while(!priorityQueue.isEmpty() ){
count++;
- //FeatureReader reader = queue.remove().getSecond();
- file = priorityQueue.remove().getSecond();
+ File file = priorityQueue.remove().getSecond();
if (!file.exists()) {
throw new UserException(String.format("File %s doesn't exist",file.getAbsolutePath()));
}
- FeatureReader reader;
- boolean useVCF = (file.getName().endsWith(".vcf") || file.getName().endsWith(".VCF"));
- if(useVCF)
- reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
- else
- reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new BCF2Codec(), false);
+ FeatureReader reader = getFeatureReader(fileType, file);
if(count%10 ==0)
System.out.print(count);
@@ -255,13 +281,11 @@ public class CatVariants extends CommandLineProgram {
}
System.out.println();
- outputStream.close();
outputWriter.close();
return 0;
}
-
public static void main(String[] args){
try {
CatVariants instance = new CatVariants();
@@ -286,5 +310,4 @@ public class CatVariants extends CommandLineProgram {
return startPositionP1 < startPositionP2 ? -1 : 1 ;
}
}
-
}