Merge pull request #911 from broadinstitute/ab_vcf_cmd_line_header_909
Modify GATK command line header for unique keys
This commit is contained in:
commit
761e456d07
|
|
@ -79,13 +79,15 @@ public class GATKVCFUtils {
|
|||
/**
|
||||
* Gets the appropriately formatted header for a VCF file describing this GATK run
|
||||
*
|
||||
* @param header the existing VCFHeader that we will be adding this command line argument header line to. Existing
|
||||
* command line argument header lines will be used to generate a unique header line key.
|
||||
* @param engine the GATK engine that holds the walker name, GATK version, and other information
|
||||
* @param argumentSources contains information on the argument values provided to the GATK for converting to a
|
||||
* command line string. Should be provided from the data in the parsing engine. Can be
|
||||
* empty in which case the command line will be the empty string.
|
||||
* @return VCF header line describing this run of the GATK.
|
||||
*/
|
||||
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
|
||||
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final VCFHeader header, final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
|
||||
if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
|
||||
if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null");
|
||||
|
||||
|
|
@ -96,7 +98,36 @@ public class GATKVCFUtils {
|
|||
attributes.put("Date", date.toString());
|
||||
attributes.put("Epoch", Long.toString(date.getTime()));
|
||||
attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray()));
|
||||
return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes);
|
||||
|
||||
// in case the walker name contains space, remove any spaces
|
||||
String key = getCommandLineKey(header, engine.getWalkerName().replaceAll("\\s", ""));
|
||||
return new VCFSimpleHeaderLine(key, attributes);
|
||||
}
|
||||
|
||||
// create a unique command line argument header line key. This method will look for existing
|
||||
// keys using the same walker name and append a count after it to make it unique.
|
||||
private static String getCommandLineKey(final VCFHeader header, final String walkerName) {
|
||||
final Iterator<VCFHeaderLine> existingMetaDataIterator = header.getMetaDataInInputOrder().iterator();
|
||||
|
||||
// the command line argument keys are in the format GATK_COMMAND_LINE_KEY.(walker name)
|
||||
final String searchKey = String.format("%s.%s", GATK_COMMAND_LINE_KEY, walkerName);
|
||||
|
||||
int commandLineKeyCount = 0;
|
||||
VCFHeaderLine line;
|
||||
while ( existingMetaDataIterator.hasNext() ) {
|
||||
line = existingMetaDataIterator.next();
|
||||
// if we find another key that starts with the same text as the walker
|
||||
if ( line.getKey().startsWith(searchKey) )
|
||||
commandLineKeyCount++;
|
||||
}
|
||||
|
||||
// if there are no existing keys with this same walker name, then just return the
|
||||
// GATK_COMMAND_LINE_KEY.(walker name) format
|
||||
if ( commandLineKeyCount == 0 )
|
||||
return searchKey;
|
||||
// otherwise append the count associated with this new command (existing + 1)
|
||||
else
|
||||
return String.format("%s.%d", searchKey, commandLineKeyCount+1);
|
||||
}
|
||||
|
||||
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
|
||||
|
|
|
|||
|
|
@ -253,7 +253,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
|
|||
// skip writing the command line header if requested
|
||||
if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) {
|
||||
// Always add the header line, as the current format allows multiple entries
|
||||
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources);
|
||||
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(vcfHeader, engine, argumentSources);
|
||||
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -232,7 +232,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf)));
|
||||
final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY);
|
||||
|
||||
// go through the metadata headers and look for ones that start with the GATK_COMMAND_LINE_KEY
|
||||
VCFHeaderLine versionLine = null;
|
||||
for ( final VCFHeaderLine headerLine : header.getMetaDataInInputOrder()) {
|
||||
if(headerLine.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY)) {
|
||||
versionLine = headerLine;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Assert.assertNotNull(versionLine);
|
||||
Assert.assertTrue(versionLine.toString().contains("TestPrintVariantsWalker"));
|
||||
}
|
||||
|
|
@ -251,7 +259,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
boolean foundHC = false;
|
||||
boolean foundPV = false;
|
||||
for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line.getKey().equals(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
if ( line.toString().contains("HaplotypeCaller") ) {
|
||||
Assert.assertFalse(foundHC);
|
||||
foundHC = true;
|
||||
|
|
@ -267,6 +275,41 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
Assert.assertTrue(foundPV, "Didn't find TestPrintVariantsWalker command line header field");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleGATKVersionsSameWalkerInVCF() throws Exception {
|
||||
WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference +
|
||||
" -V " + privateTestDir + "gatkCommandLineExistsInHeader.vcf"
|
||||
+ " -o %s",
|
||||
1, Arrays.asList(""));
|
||||
spec.disableShadowBCF();
|
||||
final File vcf = executeTest("testMultipleGATKVersionsSameWalkerInVCF", spec).first.get(0);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf)));
|
||||
|
||||
boolean foundFirstWalker = false;
|
||||
boolean foundSecondWalker = false;
|
||||
for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
// check if we found the second walker command line header field key
|
||||
if ( line.getKey().contains("TestPrintVariantsWalker.2") ) {
|
||||
Assert.assertFalse(foundSecondWalker);
|
||||
foundSecondWalker = true;
|
||||
}
|
||||
// otherwise if this is not the second walker command but contains the same
|
||||
// walker name, then it is the first occurrence. If we somehow got more than
|
||||
// two occurrences of this walker, the Assert.assertFalse(foundFirstWalker);
|
||||
// will catch this
|
||||
else if ( line.getKey().contains("TestPrintVariantsWalker") ) {
|
||||
Assert.assertFalse(foundFirstWalker);
|
||||
foundFirstWalker = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertTrue(foundFirstWalker, "Didn't find TestPrintVariantsWalker command line header field");
|
||||
Assert.assertTrue(foundSecondWalker, "Didn't find (second) TestPrintVariantsWalker command line header field");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test that defaultBaseQualities actually works
|
||||
|
|
|
|||
|
|
@ -70,17 +70,21 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
|
|||
final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine();
|
||||
testEngine2.setWalker(walker2);
|
||||
|
||||
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine1, Collections.EMPTY_LIST);
|
||||
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
|
||||
logger.warn(line1);
|
||||
Assert.assertNotNull(line1);
|
||||
Assert.assertEquals(line1.getKey(), GATKVCFUtils.GATK_COMMAND_LINE_KEY);
|
||||
for ( final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
|
||||
// assert the key matches the expected format (GATKVCFUtils.GATK_COMMAND_LINE_KEY).(walker name)
|
||||
final String expectedLine1Key = String.format("%s.%s", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
|
||||
Assert.assertEquals(line1.getKey(), expectedLine1Key);
|
||||
|
||||
for (final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
|
||||
Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue());
|
||||
Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName()));
|
||||
|
||||
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine2, Collections.EMPTY_LIST);
|
||||
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine2, Collections.EMPTY_LIST);
|
||||
logger.warn(line2);
|
||||
|
||||
|
||||
header.addMetaDataLine(line1);
|
||||
final Set<VCFHeaderLine> lines1 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines1.contains(line1));
|
||||
|
|
@ -89,6 +93,23 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
|
|||
final Set<VCFHeaderLine> lines2 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines2.contains(line1));
|
||||
Assert.assertTrue(lines2.contains(line2));
|
||||
|
||||
// create a new header line using the same engine as used by line 1
|
||||
final VCFHeaderLine line3 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
|
||||
logger.warn(line3);
|
||||
|
||||
// ensure convention followed by getCommandLineArgumentHeaderLine is to append ".(number of duplicate engine runs)"
|
||||
// line3 uses the same walker as line1, whereas line2 uses a different walker. line3 is the second occurrence of walker1
|
||||
// so a ".2" gets appended afterwards
|
||||
final String expectedLine3Key = String.format("%s.%s.2", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
|
||||
Assert.assertEquals(line3.getKey(), expectedLine3Key);
|
||||
|
||||
header.addMetaDataLine(line3);
|
||||
|
||||
final Set<VCFHeaderLine> lines3 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines3.contains(line1));
|
||||
Assert.assertTrue(lines3.contains(line2));
|
||||
Assert.assertTrue(lines3.contains(line3));
|
||||
}
|
||||
|
||||
private class IndexCreatorTest extends TestDataProvider {
|
||||
|
|
|
|||
Loading…
Reference in New Issue