Merge pull request #911 from broadinstitute/ab_vcf_cmd_line_header_909

Modify GATK command line header for unique keys
This commit is contained in:
droazen 2015-04-02 16:52:42 -04:00
commit 761e456d07
4 changed files with 104 additions and 9 deletions

View File

@ -79,13 +79,15 @@ public class GATKVCFUtils {
/**
* Gets the appropriately formatted header for a VCF file describing this GATK run
*
* @param header the existing VCFHeader that we will be adding this command line argument header line to. Existing
* command line argument header lines will be used to generate a unique header line key.
* @param engine the GATK engine that holds the walker name, GATK version, and other information
* @param argumentSources contains information on the argument values provided to the GATK for converting to a
* command line string. Should be provided from the data in the parsing engine. Can be
* empty in which case the command line will be the empty string.
* @return VCF header line describing this run of the GATK.
*/
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
public static VCFHeaderLine getCommandLineArgumentHeaderLine(final VCFHeader header, final GenomeAnalysisEngine engine, final Collection<Object> argumentSources) {
if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null");
@ -96,7 +98,36 @@ public class GATKVCFUtils {
attributes.put("Date", date.toString());
attributes.put("Epoch", Long.toString(date.getTime()));
attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray()));
return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes);
// the walker name may contain whitespace; strip all of it so the header key is a single token
String key = getCommandLineKey(header, engine.getWalkerName().replaceAll("\\s", ""));
return new VCFSimpleHeaderLine(key, attributes);
}
/**
 * Creates a command line argument header line key that is not yet used in the given header.
 *
 * Keys have the format GATK_COMMAND_LINE_KEY.(walker name) for the first run of a walker and
 * GATK_COMMAND_LINE_KEY.(walker name).(N) for later runs, with N starting at 2.
 *
 * Existing keys are compared for exact equality rather than by prefix, so a key belonging to a
 * walker whose name merely starts with this walker's name is not counted. The returned key is
 * also guaranteed not to duplicate a key already present, even if the existing numbering has gaps.
 *
 * @param header the header whose existing meta-data line keys must be avoided
 * @param walkerName the walker name, used verbatim as the second component of the key
 * @return a key that no meta-data line currently in the header uses
 */
private static String getCommandLineKey(final VCFHeader header, final String walkerName) {
    // the command line argument keys are in the format GATK_COMMAND_LINE_KEY.(walker name)
    final String baseKey = String.format("%s.%s", GATK_COMMAND_LINE_KEY, walkerName);

    // the first run of a walker gets the bare key; later runs get ".2", ".3", ...
    String candidateKey = baseKey;
    int occurrence = 1;
    while ( isHeaderKeyInUse(header, candidateKey) ) {
        occurrence++;
        candidateKey = String.format("%s.%d", baseKey, occurrence);
    }
    return candidateKey;
}

// returns true if any meta-data line already in the header uses exactly the given key
private static boolean isHeaderKeyInUse(final VCFHeader header, final String key) {
    for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
        if ( key.equals(line.getKey()) )
            return true;
    }
    return false;
}
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {

View File

@ -253,7 +253,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
// skip writing the command line header if requested
if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) {
// Always add the header line, as the current format allows multiple entries
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources);
final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(vcfHeader, engine, argumentSources);
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}

View File

@ -232,7 +232,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0);
final VCFCodec codec = new VCFCodec();
final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf)));
final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY);
// go through the metadata headers and look for ones that start with the GATK_COMMAND_LINE_KEY
VCFHeaderLine versionLine = null;
for ( final VCFHeaderLine headerLine : header.getMetaDataInInputOrder()) {
if(headerLine.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY)) {
versionLine = headerLine;
break;
}
}
Assert.assertNotNull(versionLine);
Assert.assertTrue(versionLine.toString().contains("TestPrintVariantsWalker"));
}
@ -251,7 +259,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
boolean foundHC = false;
boolean foundPV = false;
for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
if ( line.getKey().equals(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
if ( line.toString().contains("HaplotypeCaller") ) {
Assert.assertFalse(foundHC);
foundHC = true;
@ -267,6 +275,41 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
Assert.assertTrue(foundPV, "Didn't find TestPrintVariantsWalker command line header field");
}
/**
 * Runs TestPrintVariantsWalker on a VCF named gatkCommandLineExistsInHeader.vcf and checks that the
 * output header contains exactly one command line key for the first TestPrintVariantsWalker run
 * and exactly one key carrying the ".2" suffix for the second run.
 */
@Test(enabled = true)
public void testMultipleGATKVersionsSameWalkerInVCF() throws Exception {
    WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference +
            " -V " + privateTestDir + "gatkCommandLineExistsInHeader.vcf"
            + " -o %s",
            1, Arrays.asList(""));
    spec.disableShadowBCF();
    final File vcf = executeTest("testMultipleGATKVersionsSameWalkerInVCF", spec).first.get(0);
    final VCFCodec codec = new VCFCodec();

    // read the header and always release the file handle, even if parsing fails
    final VCFHeader header;
    final FileInputStream vcfStream = new FileInputStream(vcf);
    try {
        header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(vcfStream));
    } finally {
        vcfStream.close();
    }

    boolean foundFirstWalker = false;
    boolean foundSecondWalker = false;
    for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
        if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) {
            // check if we found the second walker command line header field key
            if ( line.getKey().contains("TestPrintVariantsWalker.2") ) {
                Assert.assertFalse(foundSecondWalker);
                foundSecondWalker = true;
            }
            // otherwise if this is not the second walker command but contains the same
            // walker name, then it is the first occurrence. If we somehow got more than
            // two occurrences of this walker, the Assert.assertFalse(foundFirstWalker);
            // will catch this
            else if ( line.getKey().contains("TestPrintVariantsWalker") ) {
                Assert.assertFalse(foundFirstWalker);
                foundFirstWalker = true;
            }
        }
    }
    Assert.assertTrue(foundFirstWalker, "Didn't find TestPrintVariantsWalker command line header field");
    Assert.assertTrue(foundSecondWalker, "Didn't find (second) TestPrintVariantsWalker command line header field");
}
// --------------------------------------------------------------------------------
//
// Test that defaultBaseQualities actually works

View File

@ -70,17 +70,21 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine();
testEngine2.setWalker(walker2);
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine1, Collections.EMPTY_LIST);
final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
logger.warn(line1);
Assert.assertNotNull(line1);
Assert.assertEquals(line1.getKey(), GATKVCFUtils.GATK_COMMAND_LINE_KEY);
for ( final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
// assert the key matches the expected format (GATKVCFUtils.GATK_COMMAND_LINE_KEY).(walker name)
final String expectedLine1Key = String.format("%s.%s", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
Assert.assertEquals(line1.getKey(), expectedLine1Key);
for (final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions"))
Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue());
Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName()));
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine2, Collections.EMPTY_LIST);
final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine2, Collections.EMPTY_LIST);
logger.warn(line2);
header.addMetaDataLine(line1);
final Set<VCFHeaderLine> lines1 = header.getMetaDataInInputOrder();
Assert.assertTrue(lines1.contains(line1));
@ -89,6 +93,23 @@ public class GATKVCFUtilsUnitTest extends BaseTest {
final Set<VCFHeaderLine> lines2 = header.getMetaDataInInputOrder();
Assert.assertTrue(lines2.contains(line1));
Assert.assertTrue(lines2.contains(line2));
// create a new header line using the same engine as used by line 1
final VCFHeaderLine line3 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST);
logger.warn(line3);
// ensure convention followed by getCommandLineArgumentHeaderLine is to append ".(number of duplicate engine runs)"
// line3 uses the same walker as line1, whereas line2 uses a different walker. line3 is the second occurrence of walker1
// so a ".2" gets appended afterwards
final String expectedLine3Key = String.format("%s.%s.2", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName());
Assert.assertEquals(line3.getKey(), expectedLine3Key);
header.addMetaDataLine(line3);
final Set<VCFHeaderLine> lines3 = header.getMetaDataInInputOrder();
Assert.assertTrue(lines3.contains(line1));
Assert.assertTrue(lines3.contains(line2));
Assert.assertTrue(lines3.contains(line3));
}
private class IndexCreatorTest extends TestDataProvider {