Modify GATK command line header for unique keys
The GATK command line header keys were being repeated in the VCF, and HTSJDK subsequently collapsed them into a single key/value pair, losing all but one. This commit resolves the issue by appending the name of the walker after the text "GATKCommandLine", followed by a number if the same walker was used more than once: the key is GATKCommandLine.(walker name) for the first occurrence of the walker, and GATKCommandLine.(walker name).# for later occurrences, where # is the number of the occurrence of the walker (e.g. GATKCommandLine.SomeWalker.2 for the second occurrence of SomeWalker). An integration test was added to EngineFeaturesIntegrationTest to verify that two runs of the same walker follow the expected form. Resolves #909. See also: HTSJDK #43.
This commit is contained in:
parent
c2c0037969
commit
024ec69e97
|
|
@ -79,13 +79,15 @@ public class GATKVCFUtils {
|
|||
/**
|
||||
* Gets the appropriately formatted header for a VCF file describing this GATK run
|
||||
*
|
||||
* @param header the existing VCFHeader that we will be adding this command line argument header line to. Existing
|
||||
* command line argument header lines will be used to generate a unique header line key.
|
||||
* @param engine the GATK engine that holds the walker name, GATK version, and other information
|
||||
* @param argumentSources contains information on the argument values provided to the GATK for converting to a
|
||||
* command line string. Should be provided from the data in the parsing engine. Can be
|
||||
* empty in which case the command line will be the empty string.
|
||||
* @return VCF header line describing this run of the GATK.
|
||||
*/
|
||||
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
|
||||
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final VCFHeader header, final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
|
||||
if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
|
||||
if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null");
|
||||
|
||||
|
|
@ -96,7 +98,36 @@ public class GATKVCFUtils {
|
|||
attributes.put("Date", date.toString());
|
||||
attributes.put("Epoch", Long.toString(date.getTime()));
|
||||
attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray()));
|
||||
return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes);
|
||||
|
||||
// in case the walker name contains space, remove any spaces
|
||||
String key = getCommandLineKey(header, engine.getWalkerName().replaceAll("\\s", ""));
|
||||
return new VCFSimpleHeaderLine(key, attributes);
|
||||
}
|
||||
|
||||
// create a unique command line argument header line key. This method will look for existing
|
||||
// keys using the same walker name and append a count after it to make it unique.
|
||||
private static String getCommandLineKey(final VCFHeader header, final String walkerName) {
|
||||
final Iterator<VCFHeaderLine> existingMetaDataIterator = header.getMetaDataInInputOrder().iterator();
|
||||
|
||||
// the command line argument keys are in the format GATK_COMMAND_LINE_KEY.(walker name)
|
||||
final String searchKey = String.format("%s.%s", GATK_COMMAND_LINE_KEY, walkerName);
|
||||
|
||||
int commandLineKeyCount = 0;
|
||||
VCFHeaderLine line;
|
||||
while ( existingMetaDataIterator.hasNext() ) {
|
||||
line = existingMetaDataIterator.next();
|
||||
// if we find another key that starts with the same text as the walker
|
||||
if ( line.getKey().startsWith(searchKey) )
|
||||
commandLineKeyCount++;
|
||||
}
|
||||
|
||||
// if there are no existing keys with this same walker name, then just return the
|
||||
// GATK_COMMAND_LINE_KEY.(walker name) format
|
||||
if ( commandLineKeyCount == 0 )
|
||||
return searchKey;
|
||||
// otherwise append the count associated with this new command (existing + 1)
|
||||
else
|
||||
return String.format("%s.%d", searchKey, commandLineKeyCount+1);
|
||||
}
|
||||
|
||||
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
|
||||
|
|
|
|||
|
|
@ -253,7 +253,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
|
|||
// skip writing the command line header if requested
|
||||
if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) {
|
||||
// Always add the header line, as the current format allows multiple entries
|
||||
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources);
|
||||
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(vcfHeader, engine, argumentSources);
|
||||
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -232,7 +232,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf)));
|
||||
final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY);
|
||||
|
||||
// go through the metadata headers and look for ones that start with the GATK_COMMAND_LINE_KEY
|
||||
VCFHeaderLine versionLine = null;
|
||||
for ( final VCFHeaderLine headerLine : header.getMetaDataInInputOrder()) {
|
||||
if(headerLine.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY)) {
|
||||
versionLine = headerLine;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Assert.assertNotNull(versionLine);
|
||||
Assert.assertTrue(versionLine.toString().contains("TestPrintVariantsWalker"));
|
||||
}
|
||||
|
|
@ -251,7 +259,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
boolean foundHC = false;
|
||||
boolean foundPV = false;
|
||||
for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line.getKey().equals(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
if ( line.toString().contains("HaplotypeCaller") ) {
|
||||
Assert.assertFalse(foundHC);
|
||||
foundHC = true;
|
||||
|
|
@ -267,6 +275,41 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
|
|||
Assert.assertTrue(foundPV, "Didn't find TestPrintVariantsWalker command line header field");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleGATKVersionsSameWalkerInVCF() throws Exception {
|
||||
WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference +
|
||||
" -V " + privateTestDir + "gatkCommandLineExistsInHeader.vcf"
|
||||
+ " -o %s",
|
||||
1, Arrays.asList(""));
|
||||
spec.disableShadowBCF();
|
||||
final File vcf = executeTest("testMultipleGATKVersionsSameWalkerInVCF", spec).first.get(0);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf)));
|
||||
|
||||
boolean foundFirstWalker = false;
|
||||
boolean foundSecondWalker = false;
|
||||
for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
|
||||
if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
|
||||
// check if we found the second walker command line header field key
|
||||
if ( line.getKey().contains("TestPrintVariantsWalker.2") ) {
|
||||
Assert.assertFalse(foundSecondWalker);
|
||||
foundSecondWalker = true;
|
||||
}
|
||||
// otherwise if this is not the second walker command but contains the same
|
||||
// walker name, then it is the first occurrence. If we somehow got more than
|
||||
// two occurrences of this walker, the Assert.assertFalse(foundFirstWalker);
|
||||
// will catch this
|
||||
else if ( line.getKey().contains("TestPrintVariantsWalker") ) {
|
||||
Assert.assertFalse(foundFirstWalker);
|
||||
foundFirstWalker = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertTrue(foundFirstWalker, "Didn't find TestPrintVariantsWalker command line header field");
|
||||
Assert.assertTrue(foundSecondWalker, "Didn't find (second) TestPrintVariantsWalker command line header field");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test that defaultBaseQualities actually works
|
||||
|
|
|
|||
|
|
@ -70,17 +70,21 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
|
|||
final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine();
|
||||
testEngine2.setWalker(walker2);
|
||||
|
||||
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine1, Collections.EMPTY_LIST);
|
||||
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
|
||||
logger.warn(line1);
|
||||
Assert.assertNotNull(line1);
|
||||
Assert.assertEquals(line1.getKey(), GATKVCFUtils.GATK_COMMAND_LINE_KEY);
|
||||
for ( final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
|
||||
// assert the key matches the expected format (GATKVCFUtils.GATK_COMMAND_LINE_KEY).(walker name)
|
||||
final String expectedLine1Key = String.format("%s.%s", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
|
||||
Assert.assertEquals(line1.getKey(), expectedLine1Key);
|
||||
|
||||
for (final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
|
||||
Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue());
|
||||
Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName()));
|
||||
|
||||
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine2, Collections.EMPTY_LIST);
|
||||
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine2, Collections.EMPTY_LIST);
|
||||
logger.warn(line2);
|
||||
|
||||
|
||||
header.addMetaDataLine(line1);
|
||||
final Set<VCFHeaderLine> lines1 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines1.contains(line1));
|
||||
|
|
@ -89,6 +93,23 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
|
|||
final Set<VCFHeaderLine> lines2 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines2.contains(line1));
|
||||
Assert.assertTrue(lines2.contains(line2));
|
||||
|
||||
// create a new header line using the same engine as used by line 1
|
||||
final VCFHeaderLine line3 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
|
||||
logger.warn(line3);
|
||||
|
||||
// ensure convention followed by getCommandLineArgumentHeaderLine is to append ".(number of duplicate engine runs)"
|
||||
// line3 uses the same walker as line1, whereas line2 uses a different walker. line3 is the second occurrence of walker1
|
||||
// so a ".2" gets appended afterwards
|
||||
final String expectedLine3Key = String.format("%s.%s.2", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
|
||||
Assert.assertEquals(line3.getKey(), expectedLine3Key);
|
||||
|
||||
header.addMetaDataLine(line3);
|
||||
|
||||
final Set<VCFHeaderLine> lines3 = header.getMetaDataInInputOrder();
|
||||
Assert.assertTrue(lines3.contains(line1));
|
||||
Assert.assertTrue(lines3.contains(line2));
|
||||
Assert.assertTrue(lines3.contains(line3));
|
||||
}
|
||||
|
||||
private class IndexCreatorTest extends TestDataProvider {
|
||||
|
|
|
|||
Loading…
Reference in New Issue